Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU')
51 files changed, 1235 insertions, 1012 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp index aab76d27ef11..d28f38e42430 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp @@ -173,14 +173,7 @@ constexpr AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() { const AMDGPUFunctionArgInfo & AMDGPUArgumentUsageInfo::lookupFuncArgInfo(const Function &F) const { auto I = ArgInfoMap.find(&F); - if (I == ArgInfoMap.end()) { - if (AMDGPUTargetMachine::EnableFixedFunctionABI) - return FixedABIFunctionInfo; - - // Without the fixed ABI, we assume no function has special inputs. - assert(F.isDeclaration()); - return ExternFunctionInfo; - } - + if (I == ArgInfoMap.end()) + return FixedABIFunctionInfo; return I->second; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index f0aadab3302f..b4ebc7d7d75f 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -112,6 +112,17 @@ static bool isDSAddress(const Constant *C) { return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS; } +/// Returns true if the function requires the implicit argument be passed +/// regardless of the function contents. +static bool funcRequiresImplicitArgPtr(const Function &F) { + // Sanitizers require the hostcall buffer passed in the implicit arguments. + return F.hasFnAttribute(Attribute::SanitizeAddress) || + F.hasFnAttribute(Attribute::SanitizeThread) || + F.hasFnAttribute(Attribute::SanitizeMemory) || + F.hasFnAttribute(Attribute::SanitizeHWAddress) || + F.hasFnAttribute(Attribute::SanitizeMemTag); +} + namespace { class AMDGPUInformationCache : public InformationCache { public: @@ -296,7 +307,7 @@ struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize { bool AllCallSitesKnown = true; if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown)) - indicatePessimisticFixpoint(); + return indicatePessimisticFixpoint(); return Change; } @@ -339,7 +350,17 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { void initialize(Attributor &A) override { Function *F = getAssociatedFunction(); + + // If the function requires the implicit arg pointer due to sanitizers, + // assume it's needed even if explicitly marked as not requiring it. 
+ const bool NeedsImplicit = funcRequiresImplicitArgPtr(*F); + if (NeedsImplicit) + removeAssumedBits(IMPLICIT_ARG_PTR); + for (auto Attr : ImplicitAttrs) { + if (NeedsImplicit && Attr.first == IMPLICIT_ARG_PTR) + continue; + if (F->hasFnAttribute(Attr.second)) addKnownBits(Attr.first); } @@ -500,6 +521,9 @@ struct AAAMDFlatWorkGroupSize std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F); intersectKnown( ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1))); + + if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) + indicatePessimisticFixpoint(); } ChangeStatus updateImpl(Attributor &A) override { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 43928d7c2a09..2f1e7823f65c 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -652,8 +652,8 @@ bool AMDGPUCallLowering::lowerFormalArguments( ++PSInputNum; if (SkipArg) { - for (int I = 0, E = VRegs[Idx].size(); I != E; ++I) - B.buildUndef(VRegs[Idx][I]); + for (Register R : VRegs[Idx]) + B.buildUndef(R); ++Idx; continue; @@ -715,10 +715,9 @@ bool AMDGPUCallLowering::lowerFormalArguments( if (!MBB.empty()) B.setInstr(*MBB.begin()); - if (!IsEntryFunc) { + if (!IsEntryFunc && !IsGraphics) { // For the fixed ABI, pass workitem IDs in the last argument register. - if (AMDGPUTargetMachine::EnableFixedFunctionABI) - TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); + TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); } IncomingValueAssigner Assigner(AssignFn); @@ -731,11 +730,6 @@ bool AMDGPUCallLowering::lowerFormalArguments( uint64_t StackOffset = Assigner.StackOffset; - if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) { - // Special inputs come after user arguments. - TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info); - } - // Start adding system SGPRs. if (IsEntryFunc) { TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics); @@ -829,9 +823,12 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder, if (IncomingArg) { LI->loadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy); - } else { - assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); + } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) { LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder); + } else { + // We may have proven the input wasn't needed, although the ABI is + // requiring it. We just need to allocate the register appropriately. + MIRBuilder.buildUndef(InputReg); } if (OutgoingArg->isRegister()) { @@ -1233,8 +1230,7 @@ bool AMDGPUCallLowering::lowerTailCall( // after the ordinary user argument registers. SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs; - if (AMDGPUTargetMachine::EnableFixedFunctionABI && - Info.CallConv != CallingConv::AMDGPU_Gfx) { + if (Info.CallConv != CallingConv::AMDGPU_Gfx) { // With a fixed ABI, allocate fixed registers before user arguments. 
if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info)) return false; @@ -1300,12 +1296,6 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const SITargetLowering &TLI = *getTLI<SITargetLowering>(); const DataLayout &DL = F.getParent()->getDataLayout(); - if (!AMDGPUTargetMachine::EnableFixedFunctionABI && - Info.CallConv != CallingConv::AMDGPU_Gfx) { - LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n"); - return false; - } - SmallVector<ArgInfo, 8> OutArgs; for (auto &OrigArg : Info.OrigArgs) splitToValueTypes(OrigArg, OutArgs, DL, Info.CallConv); @@ -1359,8 +1349,7 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // after the ordinary user argument registers. SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs; - if (AMDGPUTargetMachine::EnableFixedFunctionABI && - Info.CallConv != CallingConv::AMDGPU_Gfx) { + if (Info.CallConv != CallingConv::AMDGPU_Gfx) { // With a fixed ABI, allocate fixed registers before user arguments. if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info)) return false; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index c7c5ff7bcbe7..2415fdfecaae 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -64,6 +64,30 @@ def int_minmax_to_med3 : GICombineRule< [{ return RegBankHelper.matchIntMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]), (apply [{ RegBankHelper.applyMed3(*${min_or_max}, ${matchinfo}); }])>; +def fp_minmax_to_med3 : GICombineRule< + (defs root:$min_or_max, med3_matchdata:$matchinfo), + (match (wip_match_opcode G_FMAXNUM, + G_FMINNUM, + G_FMAXNUM_IEEE, + G_FMINNUM_IEEE):$min_or_max, + [{ return RegBankHelper.matchFPMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]), + (apply [{ RegBankHelper.applyMed3(*${min_or_max}, ${matchinfo}); }])>; + +def fp_minmax_to_clamp : GICombineRule< + (defs root:$min_or_max, register_matchinfo:$matchinfo), + (match (wip_match_opcode G_FMAXNUM, + G_FMINNUM, + G_FMAXNUM_IEEE, + G_FMINNUM_IEEE):$min_or_max, + [{ return RegBankHelper.matchFPMinMaxToClamp(*${min_or_max}, ${matchinfo}); }]), + (apply [{ RegBankHelper.applyClamp(*${min_or_max}, ${matchinfo}); }])>; + +def fmed3_intrinsic_to_clamp : GICombineRule< + (defs root:$fmed3, register_matchinfo:$matchinfo), + (match (wip_match_opcode G_INTRINSIC):$fmed3, + [{ return RegBankHelper.matchFPMed3ToClamp(*${fmed3}, ${matchinfo}); }]), + (apply [{ RegBankHelper.applyClamp(*${fmed3}, ${matchinfo}); }])>; + def remove_fcanonicalize_matchinfo : GIDefMatchData<"Register">; def remove_fcanonicalize : GICombineRule< @@ -102,7 +126,9 @@ def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper< } def AMDGPURegBankCombinerHelper : GICombinerHelper< - "AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain]> { + "AMDGPUGenRegBankCombinerHelper", + [zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain, + fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> { let DisableRuleOption = "amdgpuregbankcombiner-disable-rule"; let StateClass = "AMDGPURegBankCombinerHelperState"; let AdditionalArguments = []; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp index 301e6f6d6f42..e79ff9b597c9 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp +++ 
b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp @@ -378,5 +378,4 @@ void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI, } MI.eraseFromParent(); - return; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 12cef2774aaf..7fd94a977be7 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -172,6 +172,8 @@ def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE3, AMDGPUcvt_f32_ubyte3>; def : GINodeEquiv<G_AMDGPU_CVT_PK_I16_I32, AMDGPUpk_i16_i32_impl>; def : GINodeEquiv<G_AMDGPU_SMED3, AMDGPUsmed3>; def : GINodeEquiv<G_AMDGPU_UMED3, AMDGPUumed3>; +def : GINodeEquiv<G_AMDGPU_FMED3, AMDGPUfmed3_impl>; +def : GINodeEquiv<G_AMDGPU_CLAMP, AMDGPUclamp>; def : GINodeEquiv<G_AMDGPU_ATOMIC_CMPXCHG, AMDGPUatomic_cmp_swap>; def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD, SIbuffer_load>; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index b9c59f4c615a..699c6c479455 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -280,11 +280,12 @@ void MetadataStreamerV2::emitKernelAttrs(const Function &Func) { } } -void MetadataStreamerV2::emitKernelArgs(const Function &Func) { +void MetadataStreamerV2::emitKernelArgs(const Function &Func, + const GCNSubtarget &ST) { for (auto &Arg : Func.args()) emitKernelArg(Arg); - emitHiddenKernelArgs(Func); + emitHiddenKernelArgs(Func, ST); } void MetadataStreamerV2::emitKernelArg(const Argument &Arg) { @@ -381,10 +382,9 @@ void MetadataStreamerV2::emitKernelArg(const DataLayout &DL, Type *Ty, } } -void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func) { - int HiddenArgNumBytes = - getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0); - +void MetadataStreamerV2::emitHiddenKernelArgs(const Function &Func, + const GCNSubtarget &ST) { + unsigned HiddenArgNumBytes = ST.getImplicitArgNumBytes(Func); if (!HiddenArgNumBytes) return; @@ -465,11 +465,12 @@ void MetadataStreamerV2::emitKernel(const MachineFunction &MF, HSAMetadata.mKernels.push_back(Kernel::Metadata()); auto &Kernel = HSAMetadata.mKernels.back(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); Kernel.mName = std::string(Func.getName()); Kernel.mSymbolName = (Twine(Func.getName()) + Twine("@kd")).str(); emitKernelLanguage(Func); emitKernelAttrs(Func); - emitKernelArgs(Func); + emitKernelArgs(Func, ST); HSAMetadata.mKernels.back().mCodeProps = CodeProps; HSAMetadata.mKernels.back().mDebugProps = DebugProps; } @@ -673,13 +674,14 @@ void MetadataStreamerV3::emitKernelAttrs(const Function &Func, } void MetadataStreamerV3::emitKernelArgs(const Function &Func, + const GCNSubtarget &ST, msgpack::MapDocNode Kern) { unsigned Offset = 0; auto Args = HSAMetadataDoc->getArrayNode(); for (auto &Arg : Func.args()) emitKernelArg(Arg, Offset, Args); - emitHiddenKernelArgs(Func, Offset, Args); + emitHiddenKernelArgs(Func, ST, Offset, Args); Kern[".args"] = Args; } @@ -791,11 +793,10 @@ void MetadataStreamerV3::emitKernelArg( } void MetadataStreamerV3::emitHiddenKernelArgs(const Function &Func, + const GCNSubtarget &ST, unsigned &Offset, msgpack::ArrayDocNode Args) { - int HiddenArgNumBytes = - getIntegerAttribute(Func, "amdgpu-implicitarg-num-bytes", 0); - + unsigned HiddenArgNumBytes = 
ST.getImplicitArgNumBytes(Func); if (!HiddenArgNumBytes) return; @@ -912,6 +913,7 @@ void MetadataStreamerV3::emitKernel(const MachineFunction &MF, const SIProgramInfo &ProgramInfo) { auto &Func = MF.getFunction(); auto Kern = getHSAKernelProps(MF, ProgramInfo); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); assert(Func.getCallingConv() == CallingConv::AMDGPU_KERNEL || Func.getCallingConv() == CallingConv::SPIR_KERNEL); @@ -925,7 +927,7 @@ void MetadataStreamerV3::emitKernel(const MachineFunction &MF, (Twine(Func.getName()) + Twine(".kd")).str(), /*Copy=*/true); emitKernelLanguage(Func, Kern); emitKernelAttrs(Func, Kern); - emitKernelArgs(Func, Kern); + emitKernelArgs(Func, ST, Kern); } Kernels.push_back(Kern); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h index af5dae1cd8c0..54ed0afbba6d 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h @@ -30,6 +30,7 @@ class MDNode; class Module; struct SIProgramInfo; class Type; +class GCNSubtarget; namespace AMDGPU { @@ -86,7 +87,8 @@ protected: void emitKernelAttrs(const Function &Func, msgpack::MapDocNode Kern); - void emitKernelArgs(const Function &Func, msgpack::MapDocNode Kern); + void emitKernelArgs(const Function &Func, const GCNSubtarget &ST, + msgpack::MapDocNode Kern); void emitKernelArg(const Argument &Arg, unsigned &Offset, msgpack::ArrayDocNode Args); @@ -98,8 +100,8 @@ protected: StringRef BaseTypeName = "", StringRef AccQual = "", StringRef TypeQual = ""); - void emitHiddenKernelArgs(const Function &Func, unsigned &Offset, - msgpack::ArrayDocNode Args); + void emitHiddenKernelArgs(const Function &Func, const GCNSubtarget &ST, + unsigned &Offset, msgpack::ArrayDocNode Args); msgpack::DocNode &getRootMetadata(StringRef Key) { return HSAMetadataDoc->getRoot().getMap(/*Convert=*/true)[Key]; @@ -173,7 +175,7 @@ private: void emitKernelAttrs(const Function &Func); - void emitKernelArgs(const Function &Func); + void emitKernelArgs(const Function &Func, const GCNSubtarget &ST); void emitKernelArg(const Argument &Arg); @@ -183,7 +185,7 @@ private: StringRef BaseTypeName = "", StringRef AccQual = "", StringRef TypeQual = ""); - void emitHiddenKernelArgs(const Function &Func); + void emitHiddenKernelArgs(const Function &Func, const GCNSubtarget &ST); const Metadata &getHSAMetadata() const { return HSAMetadata; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index 88b4ec53a2a0..db84b8766924 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -892,6 +892,15 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { } break; } + case Intrinsic::amdgcn_is_shared: + case Intrinsic::amdgcn_is_private: { + if (isa<UndefValue>(II.getArgOperand(0))) + return IC.replaceInstUsesWith(II, UndefValue::get(II.getType())); + + if (isa<ConstantPointerNull>(II.getArgOperand(0))) + return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType())); + break; + } default: { if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp 
b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 1f898f2ba8b3..5046daaed977 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -533,7 +533,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) .legalFor({S32, S16, V2S16}) .minScalar(0, S16) - .clampMaxNumElements(0, S16, 2) + .clampMaxNumElementsStrict(0, S16, 2) .widenScalarToNextMultipleOf(0, 32) .maxScalar(0, S32) .scalarize(0); @@ -541,7 +541,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) .legalFor({S32, S16, V2S16}) // Clamp modifier .minScalarOrElt(0, S16) - .clampMaxNumElements(0, S16, 2) + .clampMaxNumElementsStrict(0, S16, 2) .scalarize(0) .widenScalarToNextPow2(0, 32) .lower(); @@ -712,7 +712,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, } if (ST.hasVOP3PInsts()) - FPOpActions.clampMaxNumElements(0, S16, 2); + FPOpActions.clampMaxNumElementsStrict(0, S16, 2); FPOpActions .scalarize(0) @@ -728,7 +728,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder({G_FNEG, G_FABS}) .legalFor(FPTypesPK16) - .clampMaxNumElements(0, S16, 2) + .clampMaxNumElementsStrict(0, S16, 2) .scalarize(0) .clampScalar(0, S16, S64); @@ -965,7 +965,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (ST.has16BitInsts()) { getActionDefinitionsBuilder(G_BSWAP) .legalFor({S16, S32, V2S16}) - .clampMaxNumElements(0, S16, 2) + .clampMaxNumElementsStrict(0, S16, 2) // FIXME: Fixing non-power-of-2 before clamp is workaround for // narrowScalar limitation. .widenScalarToNextPow2(0) @@ -1052,10 +1052,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // Split vector extloads. unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); - unsigned AlignBits = Query.MMODescrs[0].AlignInBits; - - if (MemSize < DstTy.getSizeInBits()) - MemSize = std::max(MemSize, AlignBits); if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) return true; @@ -1077,12 +1073,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, return true; } - if (AlignBits < MemSize) { - const SITargetLowering *TLI = ST.getTargetLowering(); - return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, - Align(AlignBits / 8)); - } - return false; }; @@ -1176,20 +1166,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (DstSize > MemSize) return std::make_pair(0, LLT::scalar(MemSize)); - if (!isPowerOf2_32(DstSize)) { - // We're probably decomposing an odd sized store. Try to split - // to the widest type. TODO: Account for alignment. As-is it - // should be OK, since the new parts will be further legalized. - unsigned FloorSize = PowerOf2Floor(DstSize); - return std::make_pair(0, LLT::scalar(FloorSize)); - } - - if (DstSize > 32 && (DstSize % 32 != 0)) { - // FIXME: Need a way to specify non-extload of larger size if - // suitably aligned. - return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); - } - unsigned MaxSize = maxSizeForAddrSpace(ST, PtrTy.getAddressSpace(), Op == G_LOAD); @@ -1257,14 +1233,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, ElementCount::getFixed(FloorSize / EltSize), EltTy)); } - // Need to split because of alignment. 
- unsigned Align = Query.MMODescrs[0].AlignInBits; - if (EltSize > Align && - (EltSize / Align < DstTy.getNumElements())) { - return std::make_pair( - 0, LLT::fixed_vector(EltSize / Align, EltTy)); - } - // May need relegalization for the scalars. return std::make_pair(0, EltTy); }) @@ -1457,6 +1425,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // FIXME: Doesn't handle extract of illegal sizes. getActionDefinitionsBuilder(Op) .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) + .lowerIf([=](const LegalityQuery &Query) { + // Sub-vector(or single element) insert and extract. + // TODO: verify immediate offset here since lower only works with + // whole elements. + const LLT BigTy = Query.Types[BigTyIdx]; + return BigTy.isVector(); + }) // FIXME: Multiples of 16 should not be legal. .legalIf([=](const LegalityQuery &Query) { const LLT BigTy = Query.Types[BigTyIdx]; @@ -1615,7 +1590,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // Prefer to reduce vector widths for 16-bit vectors before lowering, to // get more vector shift opportunities, since we'll get those when // expanded. - .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); + .clampMaxNumElementsStrict(0, S16, 2); } else if (ST.has16BitInsts()) { SextInReg.lowerFor({{S32}, {S64}, {S16}}); } else { @@ -1637,14 +1612,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder(G_FSHR) .legalFor({{S32, S32}}) .lowerFor({{V2S16, V2S16}}) - .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) + .clampMaxNumElementsStrict(0, S16, 2) .scalarize(0) .lower(); if (ST.hasVOP3PInsts()) { getActionDefinitionsBuilder(G_FSHL) .lowerFor({{V2S16, V2S16}}) - .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)) + .clampMaxNumElementsStrict(0, S16, 2) .scalarize(0) .lower(); } else { @@ -2567,10 +2542,8 @@ bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, } else { // For cases where the widened type isn't a nice register value, unmerge // from a widened register (e.g. <3 x s16> -> <4 x s16>) - B.setInsertPt(B.getMBB(), ++B.getInsertPt()); - WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); - B.setInsertPt(B.getMBB(), MI.getIterator()); - B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); + WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); + B.buildDeleteTrailingVectorElements(ValReg, WideLoad); } } @@ -3843,6 +3816,10 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, llvm_unreachable("invalid data type"); } + if (StoreVT == LLT::fixed_vector(3, S16)) { + Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) + .getReg(0); + } return Reg; } @@ -4237,8 +4214,17 @@ static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || (I >= Intr->CoordStart && !IsA16)) { // Handle any gradient or coordinate operands that should not be packed - AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); - PackedAddrs.push_back(AddrReg); + if ((I < Intr->GradientStart) && IsA16 && + (B.getMRI()->getType(AddrReg) == S16)) { + // Special handling of bias when A16 is on. Bias is of type half but + // occupies full 32-bit. + PackedAddrs.push_back( + B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) + .getReg(0)); + } else { + AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); + PackedAddrs.push_back(AddrReg); + } } else { // Dz/dh, dz/dv and the last odd coord are packed with undef. 
Also, in 1D, // derivatives dx/dh and dx/dv are packed with undef. @@ -4676,9 +4662,23 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( // Deal with the one annoying legal case. const LLT V3S16 = LLT::fixed_vector(3, 16); if (Ty == V3S16) { - padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); - auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs); - B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); + if (IsTFE) { + if (ResultRegs.size() == 1) { + NewResultReg = ResultRegs[0]; + } else if (ResultRegs.size() == 2) { + LLT V4S16 = LLT::fixed_vector(4, 16); + NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); + } else { + return false; + } + } + + if (MRI->getType(DstReg).getNumElements() < + MRI->getType(NewResultReg).getNumElements()) { + B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); + } else { + B.buildPadVectorWithUndefElements(DstReg, NewResultReg); + } return true; } @@ -4869,8 +4869,8 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, } Ops.push_back(RayExtent); - auto packLanes = [&Ops, &S32, &B] (Register Src) { - auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); + auto packLanes = [&Ops, &S32, &B](Register Src) { + auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); Ops.push_back(Unmerge.getReg(0)); Ops.push_back(Unmerge.getReg(1)); Ops.push_back(Unmerge.getReg(2)); @@ -4878,8 +4878,8 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, packLanes(RayOrigin); if (IsA16) { - auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); - auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); + auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); + auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); Register R1 = MRI.createGenericVirtualRegister(S32); Register R2 = MRI.createGenericVirtualRegister(S32); Register R3 = MRI.createGenericVirtualRegister(S32); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index 12d6d35a6917..6e2b5dc471bc 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -24,13 +24,6 @@ // A possible future refinement is to specialise the structure per-kernel, so // that fields can be elided based on more expensive analysis. // -// NOTE: Since this pass will directly pack LDS (assume large LDS) into a struct -// type which would cause allocating huge memory for struct instance within -// every kernel. Hence, before running this pass, it is advisable to run the -// pass "amdgpu-replace-lds-use-with-pointer" which will replace LDS uses within -// non-kernel functions by pointers and thereby minimizes the unnecessary per -// kernel allocation of LDS memory. 
-// //===----------------------------------------------------------------------===// #include "AMDGPU.h" @@ -62,6 +55,20 @@ static cl::opt<bool> SuperAlignLDSGlobals( namespace { +SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) { + SmallPtrSet<GlobalValue *, 32> UsedList; + + SmallVector<GlobalValue *, 32> TmpVec; + collectUsedGlobalVariables(M, TmpVec, true); + UsedList.insert(TmpVec.begin(), TmpVec.end()); + + TmpVec.clear(); + collectUsedGlobalVariables(M, TmpVec, false); + UsedList.insert(TmpVec.begin(), TmpVec.end()); + + return UsedList; +} + class AMDGPULowerModuleLDS : public ModulePass { static void removeFromUsedList(Module &M, StringRef Name, @@ -105,11 +112,9 @@ class AMDGPULowerModuleLDS : public ModulePass { removeFromUsedLists(Module &M, const std::vector<GlobalVariable *> &LocalVars) { SmallPtrSet<Constant *, 32> LocalVarsSet; - for (size_t I = 0; I < LocalVars.size(); I++) { - if (Constant *C = dyn_cast<Constant>(LocalVars[I]->stripPointerCasts())) { + for (GlobalVariable *LocalVar : LocalVars) + if (Constant *C = dyn_cast<Constant>(LocalVar->stripPointerCasts())) LocalVarsSet.insert(C); - } - } removeFromUsedList(M, "llvm.used", LocalVarsSet); removeFromUsedList(M, "llvm.compiler.used", LocalVarsSet); } @@ -158,9 +163,9 @@ public: } bool runOnModule(Module &M) override { - UsedList = AMDGPU::getUsedList(M); - - bool Changed = processUsedLDS(M); + UsedList = getUsedList(M); + bool Changed = superAlignLDSGlobals(M); + Changed |= processUsedLDS(M); for (Function &F : M.functions()) { if (F.isDeclaration()) @@ -177,6 +182,50 @@ public: } private: + // Increase the alignment of LDS globals if necessary to maximise the chance + // that we can use aligned LDS instructions to access them. + static bool superAlignLDSGlobals(Module &M) { + const DataLayout &DL = M.getDataLayout(); + bool Changed = false; + if (!SuperAlignLDSGlobals) { + return Changed; + } + + for (auto &GV : M.globals()) { + if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) { + // Only changing alignment of LDS variables + continue; + } + if (!GV.hasInitializer()) { + // cuda/hip extern __shared__ variable, leave alignment alone + continue; + } + + Align Alignment = AMDGPU::getAlign(DL, &GV); + TypeSize GVSize = DL.getTypeAllocSize(GV.getValueType()); + + if (GVSize > 8) { + // We might want to use a b96 or b128 load/store + Alignment = std::max(Alignment, Align(16)); + } else if (GVSize > 4) { + // We might want to use a b64 load/store + Alignment = std::max(Alignment, Align(8)); + } else if (GVSize > 2) { + // We might want to use a b32 load/store + Alignment = std::max(Alignment, Align(4)); + } else if (GVSize > 1) { + // We might want to use a b16 load/store + Alignment = std::max(Alignment, Align(2)); + } + + if (Alignment != AMDGPU::getAlign(DL, &GV)) { + Changed = true; + GV.setAlignment(Alignment); + } + } + return Changed; + } + bool processUsedLDS(Module &M, Function *F = nullptr) { LLVMContext &Ctx = M.getContext(); const DataLayout &DL = M.getDataLayout(); @@ -190,31 +239,6 @@ private: return false; } - // Increase the alignment of LDS globals if necessary to maximise the chance - // that we can use aligned LDS instructions to access them. 
- if (SuperAlignLDSGlobals) { - for (auto *GV : FoundLocalVars) { - Align Alignment = AMDGPU::getAlign(DL, GV); - TypeSize GVSize = DL.getTypeAllocSize(GV->getValueType()); - - if (GVSize > 8) { - // We might want to use a b96 or b128 load/store - Alignment = std::max(Alignment, Align(16)); - } else if (GVSize > 4) { - // We might want to use a b64 load/store - Alignment = std::max(Alignment, Align(8)); - } else if (GVSize > 2) { - // We might want to use a b32 load/store - Alignment = std::max(Alignment, Align(4)); - } else if (GVSize > 1) { - // We might want to use a b16 load/store - Alignment = std::max(Alignment, Align(2)); - } - - GV->setAlignment(Alignment); - } - } - SmallVector<OptimizedStructLayoutField, 8> LayoutFields; LayoutFields.reserve(FoundLocalVars.size()); for (GlobalVariable *GV : FoundLocalVars) { @@ -343,20 +367,14 @@ private: refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias); } - // Mark kernels with asm that reads the address of the allocated structure - // This is not necessary for lowering. This lets other passes, specifically - // PromoteAlloca, accurately calculate how much LDS will be used by the - // kernel after lowering. + // This ensures the variable is allocated when called functions access it. + // It also lets other passes, specifically PromoteAlloca, accurately + // calculate how much LDS will be used by the kernel after lowering. if (!F) { IRBuilder<> Builder(Ctx); - SmallPtrSet<Function *, 32> Kernels; for (Function &Func : M.functions()) { - if (Func.isDeclaration()) - continue; - - if (AMDGPU::isKernelCC(&Func) && !Kernels.contains(&Func)) { + if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) { markUsedByKernel(Builder, &Func, SGV); - Kernels.insert(&Func); } } } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp index 5d4b007f11e6..4e2f98d2a5db 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp @@ -2786,12 +2786,8 @@ AMDGPUMachineCFGStructurizer::initializeSelectRegisters(MRT *MRT, unsigned Selec // Fixme: Move linearization creation to the original spot createLinearizedRegion(Region, SelectOut); - for (auto CI = Region->getChildren()->begin(), - CE = Region->getChildren()->end(); - CI != CE; ++CI) { - InnerSelectOut = - initializeSelectRegisters((*CI), InnerSelectOut, MRI, TII); - } + for (auto *CI : *Region->getChildren()) + InnerSelectOut = initializeSelectRegisters(CI, InnerSelectOut, MRI, TII); MRT->setBBSelectRegIn(InnerSelectOut); return InnerSelectOut; } else { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp index 2aa02299ecdc..8ad344816ad2 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp @@ -119,31 +119,27 @@ private: bool isConstantAddr(const Value *V) const; }; -static const Value *getMemoryInstrPtr(const Instruction *Inst) { - if (auto LI = dyn_cast<LoadInst>(Inst)) { - return LI->getPointerOperand(); - } - if (auto SI = dyn_cast<StoreInst>(Inst)) { - return SI->getPointerOperand(); - } - if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst)) { - return AI->getPointerOperand(); - } - if (auto AI = dyn_cast<AtomicRMWInst>(Inst)) { - return AI->getPointerOperand(); - } - if 
(auto MI = dyn_cast<AnyMemIntrinsic>(Inst)) { - return MI->getRawDest(); - } - - return nullptr; +static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType( + const Instruction *Inst) { + if (auto LI = dyn_cast<LoadInst>(Inst)) + return {LI->getPointerOperand(), LI->getType()}; + if (auto SI = dyn_cast<StoreInst>(Inst)) + return {SI->getPointerOperand(), SI->getValueOperand()->getType()}; + if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst)) + return {AI->getPointerOperand(), AI->getCompareOperand()->getType()}; + if (auto AI = dyn_cast<AtomicRMWInst>(Inst)) + return {AI->getPointerOperand(), AI->getValOperand()->getType()}; + if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst)) + return {MI->getRawDest(), Type::getInt8Ty(MI->getContext())}; + + return {nullptr, nullptr}; } bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const { LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n'); SmallSet<const Value *, 32> WorkSet; SmallSet<const Value *, 32> Visited; - if (const Value *MO = getMemoryInstrPtr(Inst)) { + if (const Value *MO = getMemoryInstrPtrAndType(Inst).first) { if (isGlobalAddr(MO)) WorkSet.insert(MO); } @@ -209,10 +205,8 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) { for (auto &B : F) { LastAccess = MemAccessInfo(); for (auto &I : B) { - if (const Value *Ptr = getMemoryInstrPtr(&I)) { - unsigned Size = divideCeil( - Ptr->getType()->getPointerElementType()->getPrimitiveSizeInBits(), - 32); + if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) { + unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32); if (isIndirectAccess(&I)) FI.IAMInstCost += Size; if (isLargeStride(&I)) @@ -326,7 +320,7 @@ bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) { AMDGPUPerfHint::MemAccessInfo AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const { MemAccessInfo MAI; - const Value *MO = getMemoryInstrPtr(Inst); + const Value *MO = getMemoryInstrPtrAndType(Inst).first; LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n'); // Do not treat local-addr memory access as large stride. 
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 3ec5dd7e0eff..f9a9fe403ff6 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -939,7 +939,7 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) { GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - GV->setAlignment(MaybeAlign(I.getAlignment())); + GV->setAlignment(I.getAlign()); Value *TCntY, *TCntZ; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index 12b5830ef930..3ce67a733c10 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -16,6 +16,7 @@ #include "AMDGPURegisterBankInfo.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" @@ -23,6 +24,7 @@ #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/Target/TargetMachine.h" #define DEBUG_TYPE "amdgpu-regbank-combiner" @@ -36,13 +38,15 @@ protected: MachineRegisterInfo &MRI; const RegisterBankInfo &RBI; const TargetRegisterInfo &TRI; + const SIInstrInfo &TII; CombinerHelper &Helper; public: AMDGPURegBankCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper) : B(B), MF(B.getMF()), MRI(*B.getMRI()), RBI(*MF.getSubtarget().getRegBankInfo()), - TRI(*MF.getSubtarget().getRegisterInfo()), Helper(Helper){}; + TRI(*MF.getSubtarget().getRegisterInfo()), + TII(*MF.getSubtarget<GCNSubtarget>().getInstrInfo()), Helper(Helper){}; bool isVgprRegBank(Register Reg); Register getAsVgpr(Register Reg); @@ -63,7 +67,19 @@ public: Register &Val, CstTy &K0, CstTy &K1); bool matchIntMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); + bool matchFPMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); + bool matchFPMinMaxToClamp(MachineInstr &MI, Register &Reg); + bool matchFPMed3ToClamp(MachineInstr &MI, Register &Reg); void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); + void applyClamp(MachineInstr &MI, Register &Reg); + +private: + AMDGPU::SIModeRegisterDefaults getMode(); + bool getIEEE(); + bool getDX10Clamp(); + bool isFminnumIeee(const MachineInstr &MI); + bool isFCst(MachineInstr *MI); + bool isClampZeroToOne(MachineInstr *K0, MachineInstr *K1); }; bool AMDGPURegBankCombinerHelper::isVgprRegBank(Register Reg) { @@ -98,6 +114,13 @@ AMDGPURegBankCombinerHelper::getMinMaxPair(unsigned Opc) { case AMDGPU::G_UMAX: case AMDGPU::G_UMIN: return {AMDGPU::G_UMIN, AMDGPU::G_UMAX, AMDGPU::G_AMDGPU_UMED3}; + case AMDGPU::G_FMAXNUM: + case AMDGPU::G_FMINNUM: + return {AMDGPU::G_FMINNUM, AMDGPU::G_FMAXNUM, AMDGPU::G_AMDGPU_FMED3}; + case AMDGPU::G_FMAXNUM_IEEE: + case AMDGPU::G_FMINNUM_IEEE: + return {AMDGPU::G_FMINNUM_IEEE, AMDGPU::G_FMAXNUM_IEEE, + AMDGPU::G_AMDGPU_FMED3}; } } @@ -148,6 +171,146 @@ bool AMDGPURegBankCombinerHelper::matchIntMinMaxToMed3( return true; } +// fmed3(NaN, K0, K1) = min(min(NaN, K0), K1) +// ieee = true : min/max(SNaN, 
K) = QNaN, min/max(QNaN, K) = K +// ieee = false : min/max(NaN, K) = K +// clamp(NaN) = dx10_clamp ? 0.0 : NaN +// Consider values of min(max(Val, K0), K1) and max(min(Val, K1), K0) as input. +// Other operand commutes (see matchMed) give same result since min and max are +// commutative. + +// Try to replace fp min(max(Val, K0), K1) or max(min(Val, K1), K0), KO<=K1 +// with fmed3(Val, K0, K1) or clamp(Val). Clamp requires K0 = 0.0 and K1 = 1.0. +// Val = SNaN only for ieee = true +// fmed3(SNaN, K0, K1) = min(min(SNaN, K0), K1) = min(QNaN, K1) = K1 +// min(max(SNaN, K0), K1) = min(QNaN, K1) = K1 +// max(min(SNaN, K1), K0) = max(K1, K0) = K1 +// Val = NaN,ieee = false or Val = QNaN,ieee = true +// fmed3(NaN, K0, K1) = min(min(NaN, K0), K1) = min(K0, K1) = K0 +// min(max(NaN, K0), K1) = min(K0, K1) = K0 (can clamp when dx10_clamp = true) +// max(min(NaN, K1), K0) = max(K1, K0) = K1 != K0 +bool AMDGPURegBankCombinerHelper::matchFPMinMaxToMed3( + MachineInstr &MI, Med3MatchInfo &MatchInfo) { + Register Dst = MI.getOperand(0).getReg(); + LLT Ty = MRI.getType(Dst); + if (Ty != LLT::scalar(16) && Ty != LLT::scalar(32)) + return false; + + auto OpcodeTriple = getMinMaxPair(MI.getOpcode()); + + Register Val; + Optional<FPValueAndVReg> K0, K1; + // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1. + if (!matchMed<GFCstAndRegMatch>(MI, MRI, OpcodeTriple, Val, K0, K1)) + return false; + + if (K0->Value > K1->Value) + return false; + + // For IEEE=false perform combine only when it's safe to assume that there are + // no NaN inputs. Most often MI is marked with nnan fast math flag. + // For IEEE=true consider NaN inputs. fmed3(NaN, K0, K1) is equivalent to + // min(min(NaN, K0), K1). Safe to fold for min(max(Val, K0), K1) since inner + // nodes(max/min) have same behavior when one input is NaN and other isn't. + // Don't consider max(min(SNaN, K1), K0) since there is no isKnownNeverQNaN, + // also post-legalizer inputs to min/max are fcanonicalized (never SNaN). + if ((getIEEE() && isFminnumIeee(MI)) || isKnownNeverNaN(Dst, MRI)) { + // Don't fold single use constant that can't be inlined. + if ((!MRI.hasOneNonDBGUse(K0->VReg) || TII.isInlineConstant(K0->Value)) && + (!MRI.hasOneNonDBGUse(K1->VReg) || TII.isInlineConstant(K1->Value))) { + MatchInfo = {OpcodeTriple.Med, Val, K0->VReg, K1->VReg}; + return true; + } + } + + return false; +} + +bool AMDGPURegBankCombinerHelper::matchFPMinMaxToClamp(MachineInstr &MI, + Register &Reg) { + // Clamp is available on all types after regbankselect (f16, f32, f64, v2f16). + auto OpcodeTriple = getMinMaxPair(MI.getOpcode()); + Register Val; + Optional<FPValueAndVReg> K0, K1; + // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). + if (!matchMed<GFCstOrSplatGFCstMatch>(MI, MRI, OpcodeTriple, Val, K0, K1)) + return false; + + if (!K0->Value.isExactlyValue(0.0) || !K1->Value.isExactlyValue(1.0)) + return false; + + // For IEEE=false perform combine only when it's safe to assume that there are + // no NaN inputs. Most often MI is marked with nnan fast math flag. + // For IEEE=true consider NaN inputs. Only min(max(QNaN, 0.0), 1.0) evaluates + // to 0.0 requires dx10_clamp = true. + if ((getIEEE() && getDX10Clamp() && isFminnumIeee(MI) && + isKnownNeverSNaN(Val, MRI)) || + isKnownNeverNaN(MI.getOperand(0).getReg(), MRI)) { + Reg = Val; + return true; + } + + return false; +} + +// Replacing fmed3(NaN, 0.0, 1.0) with clamp. Requires dx10_clamp = true. +// Val = SNaN only for ieee = true. It is important which operand is NaN. 
+// min(min(SNaN, 0.0), 1.0) = min(QNaN, 1.0) = 1.0 +// min(min(SNaN, 1.0), 0.0) = min(QNaN, 0.0) = 0.0 +// min(min(0.0, 1.0), SNaN) = min(0.0, SNaN) = QNaN +// Val = NaN,ieee = false or Val = QNaN,ieee = true +// min(min(NaN, 0.0), 1.0) = min(0.0, 1.0) = 0.0 +// min(min(NaN, 1.0), 0.0) = min(1.0, 0.0) = 0.0 +// min(min(0.0, 1.0), NaN) = min(0.0, NaN) = 0.0 +bool AMDGPURegBankCombinerHelper::matchFPMed3ToClamp(MachineInstr &MI, + Register &Reg) { + if (MI.getIntrinsicID() != Intrinsic::amdgcn_fmed3) + return false; + + // In llvm-ir, clamp is often represented as an intrinsic call to + // @llvm.amdgcn.fmed3.f32(%Val, 0.0, 1.0). Check for other operand orders. + MachineInstr *Src0 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI); + MachineInstr *Src1 = getDefIgnoringCopies(MI.getOperand(3).getReg(), MRI); + MachineInstr *Src2 = getDefIgnoringCopies(MI.getOperand(4).getReg(), MRI); + + if (isFCst(Src0) && !isFCst(Src1)) + std::swap(Src0, Src1); + if (isFCst(Src1) && !isFCst(Src2)) + std::swap(Src1, Src2); + if (isFCst(Src0) && !isFCst(Src1)) + std::swap(Src0, Src1); + if (!isClampZeroToOne(Src1, Src2)) + return false; + + Register Val = Src0->getOperand(0).getReg(); + + auto isOp3Zero = [&]() { + MachineInstr *Op3 = getDefIgnoringCopies(MI.getOperand(4).getReg(), MRI); + if (Op3->getOpcode() == TargetOpcode::G_FCONSTANT) + return Op3->getOperand(1).getFPImm()->isExactlyValue(0.0); + return false; + }; + // For IEEE=false perform combine only when it's safe to assume that there are + // no NaN inputs. Most often MI is marked with nnan fast math flag. + // For IEEE=true consider NaN inputs. Requires dx10_clamp = true. Safe to fold + // when Val could be QNaN. If Val can also be SNaN third input should be 0.0. + if (isKnownNeverNaN(MI.getOperand(0).getReg(), MRI) || + (getIEEE() && getDX10Clamp() && + (isKnownNeverSNaN(Val, MRI) || isOp3Zero()))) { + Reg = Val; + return true; + } + + return false; +} + +void AMDGPURegBankCombinerHelper::applyClamp(MachineInstr &MI, Register &Reg) { + B.setInstrAndDebugLoc(MI); + B.buildInstr(AMDGPU::G_AMDGPU_CLAMP, {MI.getOperand(0)}, {Reg}, + MI.getFlags()); + MI.eraseFromParent(); +} + void AMDGPURegBankCombinerHelper::applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) { B.setInstrAndDebugLoc(MI); @@ -158,6 +321,33 @@ void AMDGPURegBankCombinerHelper::applyMed3(MachineInstr &MI, MI.eraseFromParent(); } +AMDGPU::SIModeRegisterDefaults AMDGPURegBankCombinerHelper::getMode() { + return MF.getInfo<SIMachineFunctionInfo>()->getMode(); +} + +bool AMDGPURegBankCombinerHelper::getIEEE() { return getMode().IEEE; } + +bool AMDGPURegBankCombinerHelper::getDX10Clamp() { return getMode().DX10Clamp; } + +bool AMDGPURegBankCombinerHelper::isFminnumIeee(const MachineInstr &MI) { + return MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE; +} + +bool AMDGPURegBankCombinerHelper::isFCst(MachineInstr *MI) { + return MI->getOpcode() == AMDGPU::G_FCONSTANT; +} + +bool AMDGPURegBankCombinerHelper::isClampZeroToOne(MachineInstr *K0, + MachineInstr *K1) { + if (isFCst(K0) && isFCst(K1)) { + const ConstantFP *KO_FPImm = K0->getOperand(1).getFPImm(); + const ConstantFP *K1_FPImm = K1->getOperand(1).getFPImm(); + return (KO_FPImm->isExactlyValue(0.0) && K1_FPImm->isExactlyValue(1.0)) || + (KO_FPImm->isExactlyValue(1.0) && K1_FPImm->isExactlyValue(0.0)); + } + return false; +} + class AMDGPURegBankCombinerHelperState { protected: CombinerHelper &Helper; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp 
b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 5988403c0a29..c60012bcfe2e 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -707,9 +707,6 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( iterator_range<MachineBasicBlock::iterator> Range, SmallSet<Register, 4> &SGPROperandRegs, MachineRegisterInfo &MRI) const { - SmallVector<Register, 4> ResultRegs; - SmallVector<Register, 4> InitResultRegs; - SmallVector<Register, 4> PhiRegs; // Track use registers which have already been expanded with a readfirstlane // sequence. This may have multiple uses if moving a sequence. @@ -774,15 +771,6 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( .addReg(NewExec) .addMBB(LoopBB); - for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) { - B.buildInstr(TargetOpcode::G_PHI) - .addDef(std::get<2>(Result)) - .addReg(std::get<0>(Result)) // Initial value / implicit_def - .addMBB(&MBB) - .addReg(std::get<1>(Result)) // Mid-loop value. - .addMBB(LoopBB); - } - const DebugLoc &DL = B.getDL(); MachineInstr &FirstInst = *Range.begin(); @@ -1174,18 +1162,25 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI, // 96-bit loads are only available for vector loads. We need to split this // into a 64-bit part, and 32 (unless we can widen to a 128-bit load). if (MMO->getAlign() < Align(16)) { + MachineFunction *MF = MI.getParent()->getParent(); + ApplyRegBankMapping ApplyBank(*this, MRI, DstBank); + MachineIRBuilder B(MI, ApplyBank); + LegalizerHelper Helper(*MF, ApplyBank, B); LLT Part64, Part32; std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64); - auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0); - auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8); - - auto Undef = B.buildUndef(LoadTy); - auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0); - B.buildInsert(MI.getOperand(0), Ins0, Load1, 64); + if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) != + LegalizerHelper::Legalized) + return false; + return true; } else { LLT WiderTy = widen96To128(LoadTy); auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0); - B.buildExtract(MI.getOperand(0), WideLoad, 0); + if (WiderTy.isScalar()) + B.buildTrunc(MI.getOperand(0), WideLoad); + else { + B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(), + WideLoad); + } } } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp index d55bf3917e9c..2475b44b42a3 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp @@ -87,6 +87,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetOperations.h" +#include "llvm/Analysis/CallGraph.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" @@ -110,6 +111,18 @@ using namespace llvm; namespace { +namespace AMDGPU { +/// Collect all the instructions where user \p U belongs to. \p U could be +/// instruction itself or it could be a constant expression which is used within +/// an instruction. If \p CollectKernelInsts is true, collect instructions only +/// from kernels, otherwise collect instructions only from non-kernel functions. 
+DenseMap<Function *, SmallPtrSet<Instruction *, 8>> +getFunctionToInstsMap(User *U, bool CollectKernelInsts); + +SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV); + +} // namespace AMDGPU + class ReplaceLDSUseImpl { Module &M; LLVMContext &Ctx; @@ -127,7 +140,8 @@ class ReplaceLDSUseImpl { // Collect LDS which requires their uses to be replaced by pointer. std::vector<GlobalVariable *> collectLDSRequiringPointerReplace() { // Collect LDS which requires module lowering. - std::vector<GlobalVariable *> LDSGlobals = AMDGPU::findVariablesToLower(M); + std::vector<GlobalVariable *> LDSGlobals = + llvm::AMDGPU::findVariablesToLower(M); // Remove LDS which don't qualify for replacement. llvm::erase_if(LDSGlobals, [&](GlobalVariable *GV) { @@ -172,7 +186,7 @@ class ReplaceLDSUseImpl { AMDGPUAS::LOCAL_ADDRESS); LDSPointer->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - LDSPointer->setAlignment(AMDGPU::getAlign(DL, LDSPointer)); + LDSPointer->setAlignment(llvm::AMDGPU::getAlign(DL, LDSPointer)); // Mark that an associated LDS pointer is created for LDS. LDSToPointer[GV] = LDSPointer; @@ -245,10 +259,9 @@ class ReplaceLDSUseImpl { auto FunctionToInsts = AMDGPU::getFunctionToInstsMap(U, false /*=CollectKernelInsts*/); - for (auto FI = FunctionToInsts.begin(), FE = FunctionToInsts.end(); - FI != FE; ++FI) { - Function *F = FI->first; - auto &Insts = FI->second; + for (const auto &FunctionToInst : FunctionToInsts) { + Function *F = FunctionToInst.first; + auto &Insts = FunctionToInst.second; for (auto *I : Insts) { // If `U` is a constant expression, then we need to break the // associated instruction into a set of separate instructions by @@ -341,10 +354,9 @@ bool ReplaceLDSUseImpl::replaceLDSUse(GlobalVariable *GV) { // Traverse through each kernel K, check and if required, initialize the // LDS pointer to point to LDS within K. - for (auto KI = KernelToCallees.begin(), KE = KernelToCallees.end(); KI != KE; - ++KI) { - Function *K = KI->first; - SmallPtrSet<Function *, 8> Callees = KI->second; + for (const auto &KernelToCallee : KernelToCallees) { + Function *K = KernelToCallee.first; + SmallPtrSet<Function *, 8> Callees = KernelToCallee.second; // Compute reachable and LDS used callees for kernel K. set_intersect(Callees, LDSAccessors); @@ -378,6 +390,184 @@ bool ReplaceLDSUseImpl::replaceLDSUse(GlobalVariable *GV) { return true; } +namespace AMDGPU { + +// An helper class for collecting all reachable callees for each kernel defined +// within the module. +class CollectReachableCallees { + Module &M; + CallGraph CG; + SmallPtrSet<CallGraphNode *, 8> AddressTakenFunctions; + + // Collect all address taken functions within the module. + void collectAddressTakenFunctions() { + auto *ECNode = CG.getExternalCallingNode(); + + for (const auto &GI : *ECNode) { + auto *CGN = GI.second; + auto *F = CGN->getFunction(); + if (!F || F->isDeclaration() || llvm::AMDGPU::isKernelCC(F)) + continue; + AddressTakenFunctions.insert(CGN); + } + } + + // For given kernel, collect all its reachable non-kernel functions. + SmallPtrSet<Function *, 8> collectReachableCallees(Function *K) { + SmallPtrSet<Function *, 8> ReachableCallees; + + // Call graph node which represents this kernel. 
+ auto *KCGN = CG[K]; + + // Go through all call graph nodes reachable from the node representing this + // kernel, visit all their call sites, if the call site is direct, add + // corresponding callee to reachable callee set, if it is indirect, resolve + // the indirect call site to potential reachable callees, add them to + // reachable callee set, and repeat the process for the newly added + // potential callee nodes. + // + // FIXME: Need to handle bit-casted function pointers. + // + SmallVector<CallGraphNode *, 8> CGNStack(depth_first(KCGN)); + SmallPtrSet<CallGraphNode *, 8> VisitedCGNodes; + while (!CGNStack.empty()) { + auto *CGN = CGNStack.pop_back_val(); + + if (!VisitedCGNodes.insert(CGN).second) + continue; + + // Ignore call graph node which does not have associated function or + // associated function is not a definition. + if (!CGN->getFunction() || CGN->getFunction()->isDeclaration()) + continue; + + for (const auto &GI : *CGN) { + auto *RCB = cast<CallBase>(GI.first.getValue()); + auto *RCGN = GI.second; + + if (auto *DCallee = RCGN->getFunction()) { + ReachableCallees.insert(DCallee); + } else if (RCB->isIndirectCall()) { + auto *RCBFTy = RCB->getFunctionType(); + for (auto *ACGN : AddressTakenFunctions) { + auto *ACallee = ACGN->getFunction(); + if (ACallee->getFunctionType() == RCBFTy) { + ReachableCallees.insert(ACallee); + CGNStack.append(df_begin(ACGN), df_end(ACGN)); + } + } + } + } + } + + return ReachableCallees; + } + +public: + explicit CollectReachableCallees(Module &M) : M(M), CG(CallGraph(M)) { + // Collect address taken functions. + collectAddressTakenFunctions(); + } + + void collectReachableCallees( + DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) { + // Collect reachable callee set for each kernel defined in the module. + for (Function &F : M.functions()) { + if (!llvm::AMDGPU::isKernelCC(&F)) + continue; + Function *K = &F; + KernelToCallees[K] = collectReachableCallees(K); + } + } +}; + +/// Collect reachable callees for each kernel defined in the module \p M and +/// return collected callees at \p KernelToCallees. +void collectReachableCallees( + Module &M, + DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) { + CollectReachableCallees CRC{M}; + CRC.collectReachableCallees(KernelToCallees); +} + +/// For the given LDS global \p GV, visit all its users and collect all +/// non-kernel functions within which \p GV is used and return collected list of +/// such non-kernel functions. +SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV) { + SmallPtrSet<Function *, 8> LDSAccessors; + SmallVector<User *, 8> UserStack(GV->users()); + SmallPtrSet<User *, 8> VisitedUsers; + + while (!UserStack.empty()) { + auto *U = UserStack.pop_back_val(); + + // `U` is already visited? continue to next one. + if (!VisitedUsers.insert(U).second) + continue; + + // `U` is a global variable which is initialized with LDS. Ignore LDS. + if (isa<GlobalValue>(U)) + return SmallPtrSet<Function *, 8>(); + + // Recursively explore constant users. + if (isa<Constant>(U)) { + append_range(UserStack, U->users()); + continue; + } + + // `U` should be an instruction, if it belongs to a non-kernel function F, + // then collect F. 
+ Function *F = cast<Instruction>(U)->getFunction(); + if (!llvm::AMDGPU::isKernelCC(F)) + LDSAccessors.insert(F); + } + + return LDSAccessors; +} + +DenseMap<Function *, SmallPtrSet<Instruction *, 8>> +getFunctionToInstsMap(User *U, bool CollectKernelInsts) { + DenseMap<Function *, SmallPtrSet<Instruction *, 8>> FunctionToInsts; + SmallVector<User *, 8> UserStack; + SmallPtrSet<User *, 8> VisitedUsers; + + UserStack.push_back(U); + + while (!UserStack.empty()) { + auto *UU = UserStack.pop_back_val(); + + if (!VisitedUsers.insert(UU).second) + continue; + + if (isa<GlobalValue>(UU)) + continue; + + if (isa<Constant>(UU)) { + append_range(UserStack, UU->users()); + continue; + } + + auto *I = cast<Instruction>(UU); + Function *F = I->getFunction(); + if (CollectKernelInsts) { + if (!llvm::AMDGPU::isKernelCC(F)) { + continue; + } + } else { + if (llvm::AMDGPU::isKernelCC(F)) { + continue; + } + } + + FunctionToInsts.insert(std::make_pair(F, SmallPtrSet<Instruction *, 8>())); + FunctionToInsts[F].insert(I); + } + + return FunctionToInsts; +} + +} // namespace AMDGPU + // Entry-point function which interface ReplaceLDSUseImpl with outside of the // class. bool ReplaceLDSUseImpl::replaceLDSUse() { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 0655b4342ba1..cd05797fdbdb 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -413,21 +413,21 @@ bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const { case AMDGPU::V_MAX_I16_e32: case AMDGPU::V_MIN_I16_e64: case AMDGPU::V_MIN_I16_e32: + case AMDGPU::V_MAD_F16_e64: + case AMDGPU::V_MAD_U16_e64: + case AMDGPU::V_MAD_I16_e64: + case AMDGPU::V_FMA_F16_e64: + case AMDGPU::V_DIV_FIXUP_F16_e64: // On gfx10, all 16-bit instructions preserve the high bits. return getGeneration() <= AMDGPUSubtarget::GFX9; - case AMDGPU::V_MAD_F16_e64: case AMDGPU::V_MADAK_F16: case AMDGPU::V_MADMK_F16: case AMDGPU::V_MAC_F16_e64: case AMDGPU::V_MAC_F16_e32: case AMDGPU::V_FMAMK_F16: case AMDGPU::V_FMAAK_F16: - case AMDGPU::V_MAD_U16_e64: - case AMDGPU::V_MAD_I16_e64: - case AMDGPU::V_FMA_F16_e64: case AMDGPU::V_FMAC_F16_e64: case AMDGPU::V_FMAC_F16_e32: - case AMDGPU::V_DIV_FIXUP_F16_e64: // In gfx9, the preferred handling of the unused high 16-bits changed. Most // instructions maintain the legacy behavior of 0ing. Some instructions // changed to preserving the high bits. @@ -648,9 +648,18 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { } unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const { + assert(AMDGPU::isKernel(F.getCallingConv())); + + // We don't allocate the segment if we know the implicit arguments weren't + // used, even if the ABI implies we need them. 
+ if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr")) + return 0; + if (isMesaKernel(F)) return 16; - return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0); + + // Assume all implicit inputs are used by default + return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 56); } uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F, diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index de11676279f2..a2c61f9da8da 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -231,13 +231,6 @@ static cl::opt<bool, true> LateCFGStructurize( cl::location(AMDGPUTargetMachine::EnableLateStructurizeCFG), cl::Hidden); -static cl::opt<bool, true> EnableAMDGPUFixedFunctionABIOpt( - "amdgpu-fixed-function-abi", - cl::desc("Enable all implicit function arguments"), - cl::location(AMDGPUTargetMachine::EnableFixedFunctionABI), - cl::init(false), - cl::Hidden); - // Enable lib calls simplifications static cl::opt<bool> EnableLibCallSimplify( "amdgpu-simplify-libcall", @@ -505,7 +498,6 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; bool AMDGPUTargetMachine::EnableFunctionCalls = false; -bool AMDGPUTargetMachine::EnableFixedFunctionABI = false; bool AMDGPUTargetMachine::EnableLowerModuleLDS = true; AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 0ff2db2a52d9..226646a96953 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -37,7 +37,6 @@ protected: public: static bool EnableLateStructurizeCFG; static bool EnableFunctionCalls; - static bool EnableFixedFunctionABI; static bool EnableLowerModuleLDS; AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index ecdbdf613a53..09c5eb192e1f 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -519,57 +519,6 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost( TTI::OperandValueProperties Opd1PropInfo, TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args, const Instruction *CxtI) { - EVT OrigTy = TLI->getValueType(DL, Ty); - if (!OrigTy.isSimple()) { - // FIXME: We're having to query the throughput cost so that the basic - // implementation tries to generate legalize and scalarization costs. Maybe - // we could hoist the scalarization code here? - if (CostKind != TTI::TCK_CodeSize) - return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput, - Opd1Info, Opd2Info, Opd1PropInfo, - Opd2PropInfo, Args, CxtI); - // Scalarization - - // Check if any of the operands are vector operands. 
- int ISD = TLI->InstructionOpcodeToISD(Opcode); - assert(ISD && "Invalid opcode"); - - std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); - - bool IsFloat = Ty->isFPOrFPVectorTy(); - // Assume that floating point arithmetic operations cost twice as much as - // integer operations. - unsigned OpCost = (IsFloat ? 2 : 1); - - if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { - // The operation is legal. Assume it costs 1. - // TODO: Once we have extract/insert subvector cost we need to use them. - return LT.first * OpCost; - } - - if (!TLI->isOperationExpand(ISD, LT.second)) { - // If the operation is custom lowered, then assume that the code is twice - // as expensive. - return LT.first * 2 * OpCost; - } - - // Else, assume that we need to scalarize this op. - // TODO: If one of the types get legalized by splitting, handle this - // similarly to what getCastInstrCost() does. - if (auto *VTy = dyn_cast<VectorType>(Ty)) { - unsigned Num = cast<FixedVectorType>(VTy)->getNumElements(); - InstructionCost Cost = getArithmeticInstrCost( - Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo, Args, CxtI); - // Return the cost of multiple scalar invocation plus the cost of - // inserting and extracting the values. - SmallVector<Type *> Tys(Args.size(), Ty); - return getScalarizationOverhead(VTy, Args, Tys) + Num * Cost; - } - - // We don't know anything about this scalar instruction. - return OpCost; - } // Legalize the type. std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); @@ -742,40 +691,6 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return BaseT::getIntrinsicInstrCost(ICA, CostKind); Type *RetTy = ICA.getReturnType(); - EVT OrigTy = TLI->getValueType(DL, RetTy); - if (!OrigTy.isSimple()) { - if (CostKind != TTI::TCK_CodeSize) - return BaseT::getIntrinsicInstrCost(ICA, CostKind); - - // TODO: Combine these two logic paths. - if (ICA.isTypeBasedOnly()) - return getTypeBasedIntrinsicInstrCost(ICA, CostKind); - - unsigned RetVF = - (RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements() - : 1); - const IntrinsicInst *I = ICA.getInst(); - const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); - FastMathFlags FMF = ICA.getFlags(); - // Assume that we need to scalarize this intrinsic. - - // Compute the scalarization overhead based on Args for a vector - // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while - // CostModel will pass a vector RetTy and VF is 1. - InstructionCost ScalarizationCost = InstructionCost::getInvalid(); - if (RetVF > 1) { - ScalarizationCost = 0; - if (!RetTy->isVoidTy()) - ScalarizationCost += - getScalarizationOverhead(cast<VectorType>(RetTy), true, false); - ScalarizationCost += - getOperandsScalarizationOverhead(Args, ICA.getArgTypes()); - } - - IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, ICA.getArgTypes(), FMF, I, - ScalarizationCost); - return getIntrinsicInstrCost(Attrs, CostKind); - } // Legalize the type. 
std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index 712f6dece911..1736c078eb83 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -173,10 +173,8 @@ protected: } static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) { - for (MachineLoop::iterator iter = LoopInfo.begin(), - iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) { - (*iter)->print(dbgs()); - } + for (const MachineLoop *L : LoopInfo) + L->print(dbgs()); } // UTILITY FUNCTIONS @@ -691,9 +689,7 @@ bool AMDGPUCFGStructurizer::prepare() { SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> RetBlks; // Add an ExitBlk to loop that don't have one - for (MachineLoopInfo::iterator It = MLI->begin(), - E = MLI->end(); It != E; ++It) { - MachineLoop *LoopRep = (*It); + for (MachineLoop *LoopRep : *MLI) { MBBVector ExitingMBBs; LoopRep->getExitingBlocks(ExitingMBBs); @@ -827,14 +823,13 @@ bool AMDGPUCFGStructurizer::run() { wrapup(*GraphTraits<MachineFunction *>::nodes_begin(FuncRep)); // Detach retired Block, release memory. - for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end(); - It != E; ++It) { - if ((*It).second && (*It).second->IsRetired) { - assert(((*It).first)->getNumber() != -1); - LLVM_DEBUG(dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n";); - (*It).first->eraseFromParent(); //Remove from the parent Function. + for (auto &It : BlockInfoMap) { + if (It.second && It.second->IsRetired) { + assert((It.first)->getNumber() != -1); + LLVM_DEBUG(dbgs() << "Erase BB" << (It.first)->getNumber() << "\n";); + It.first->eraseFromParent(); // Remove from the parent Function. 
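
Many hunks in this and the following R600 files are mechanical conversions from explicit iterator loops to range-based for loops. A minimal standalone illustration of the pattern, using std::map in place of LLVM's DenseMap purely for the sake of a runnable example:

#include <iostream>
#include <map>
#include <string>

int main() {
  std::map<int, std::string> BlockInfo{{1, "retired"}, {2, "live"}};

  // Old style: explicit iterator pair.
  for (std::map<int, std::string>::iterator It = BlockInfo.begin(),
                                            E = BlockInfo.end();
       It != E; ++It)
    std::cout << It->first << " -> " << It->second << '\n';

  // New style: range-based for over the same container.
  for (const auto &Entry : BlockInfo)
    std::cout << Entry.first << " -> " << Entry.second << '\n';
}
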
} - delete (*It).second; + delete It.second; } BlockInfoMap.clear(); LLInfoMap.clear(); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 4acd77a9d5d2..2bb59086f391 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -246,8 +246,12 @@ public: return isRegKind() && !hasModifiers(); } + bool isRegOrInline(unsigned RCID, MVT type) const { + return isRegClass(RCID) || isInlinableImm(type); + } + bool isRegOrImmWithInputMods(unsigned RCID, MVT type) const { - return isRegClass(RCID) || isInlinableImm(type) || isLiteralImm(type); + return isRegOrInline(RCID, type) || isLiteralImm(type); } bool isRegOrImmWithInt16InputMods() const { @@ -372,7 +376,7 @@ public: bool isInlineValue() const; bool isRegOrInlineNoMods(unsigned RCID, MVT type) const { - return (isRegClass(RCID) || isInlinableImm(type)) && !hasModifiers(); + return isRegOrInline(RCID, type) && !hasModifiers(); } bool isSCSrcB16() const { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td index d3644db7cf8b..a535c8cc0918 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -6,11 +6,11 @@ // //===----------------------------------------------------------------------===// -def MUBUFAddr64 : ComplexPattern<i64, 4, "SelectMUBUFAddr64">; -def MUBUFOffset : ComplexPattern<i64, 3, "SelectMUBUFOffset">; +def MUBUFAddr64 : ComplexPattern<iPTR, 4, "SelectMUBUFAddr64">; +def MUBUFOffset : ComplexPattern<iPTR, 3, "SelectMUBUFOffset">; -def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantParent]>; -def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantParent], 20>; +def MUBUFScratchOffen : ComplexPattern<iPTR, 4, "SelectMUBUFScratchOffen", [], [SDNPWantParent]>; +def MUBUFScratchOffset : ComplexPattern<iPTR, 3, "SelectMUBUFScratchOffset", [], [SDNPWantParent], 20>; def BUFAddrKind { int Offset = 0; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td index bb0aa648ff90..c7ec5308e6d0 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -6,12 +6,12 @@ // //===----------------------------------------------------------------------===// -def FlatOffset : ComplexPattern<i64, 2, "SelectFlatOffset", [], [SDNPWantRoot], -10>; -def GlobalOffset : ComplexPattern<i64, 2, "SelectGlobalOffset", [], [SDNPWantRoot], -10>; -def ScratchOffset : ComplexPattern<i32, 2, "SelectScratchOffset", [], [SDNPWantRoot], -10>; +def FlatOffset : ComplexPattern<iPTR, 2, "SelectFlatOffset", [], [SDNPWantRoot], -10>; +def GlobalOffset : ComplexPattern<iPTR, 2, "SelectGlobalOffset", [], [SDNPWantRoot], -10>; +def ScratchOffset : ComplexPattern<iPTR, 2, "SelectScratchOffset", [], [SDNPWantRoot], -10>; -def GlobalSAddr : ComplexPattern<i64, 3, "SelectGlobalSAddr", [], [SDNPWantRoot], -10>; -def ScratchSAddr : ComplexPattern<i32, 2, "SelectScratchSAddr", [], [SDNPWantRoot], -10>; +def GlobalSAddr : ComplexPattern<iPTR, 3, "SelectGlobalSAddr", [], [SDNPWantRoot], -10>; +def ScratchSAddr : ComplexPattern<iPTR, 2, 
"SelectScratchSAddr", [], [SDNPWantRoot], -10>; //===----------------------------------------------------------------------===// // FLAT classes diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp index f3f664f7972a..912bcc792e4d 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp @@ -120,8 +120,7 @@ unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst, // We will now look at each of the currently executing instructions // to find out if this wait instruction still needs to wait. - for (auto I = IssuedInst.begin(), E = IssuedInst.end(); I != E; I++) { - const InstRef &PrevIR = *I; + for (const InstRef &PrevIR : IssuedInst) { const Instruction &PrevInst = *PrevIR.getInstruction(); const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size(); const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex]; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp index 29c37c706138..8a48a67b829c 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -440,9 +440,8 @@ private: CounterPropagateAddr(*Clause.first, CfCount); MachineBasicBlock *BB = Clause.first->getParent(); BuildMI(BB, DL, TII->get(R600::FETCH_CLAUSE)).addImm(CfCount); - for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { - BB->splice(InsertPos, BB, Clause.second[i]); - } + for (MachineInstr *MI : Clause.second) + BB->splice(InsertPos, BB, MI); CfCount += 2 * Clause.second.size(); } @@ -452,9 +451,8 @@ private: CounterPropagateAddr(*Clause.first, CfCount); MachineBasicBlock *BB = Clause.first->getParent(); BuildMI(BB, DL, TII->get(R600::ALU_CLAUSE)).addImm(CfCount); - for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { - BB->splice(InsertPos, BB, Clause.second[i]); - } + for (MachineInstr *MI : Clause.second) + BB->splice(InsertPos, BB, MI); CfCount += Clause.second.size(); } @@ -635,10 +633,10 @@ public: CfCount++; } MI->eraseFromParent(); - for (unsigned i = 0, e = FetchClauses.size(); i < e; i++) - EmitFetchClause(I, DL, FetchClauses[i], CfCount); - for (unsigned i = 0, e = AluClauses.size(); i < e; i++) - EmitALUClause(I, DL, AluClauses[i], CfCount); + for (ClauseFile &CF : FetchClauses) + EmitFetchClause(I, DL, CF, CfCount); + for (ClauseFile &CF : AluClauses) + EmitALUClause(I, DL, CF, CfCount); break; } default: @@ -649,8 +647,7 @@ public: break; } } - for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) { - MachineInstr *Alu = ToPopAfter[i]; + for (MachineInstr *Alu : ToPopAfter) { BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu), TII->get(R600::CF_ALU_POP_AFTER)) .addImm(Alu->getOperand(0).getImm()) diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp index a7ebf72315cb..aec8b1ae4837 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp @@ -268,17 +268,15 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const { {R600::OpName::src1_W, R600::OpName::src1_sel_W}, }; - for (unsigned j = 0; j < 8; j++) { - MachineOperand &MO = - 
MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][0])); + for (const auto &Op : OpTable) { + MachineOperand &MO = MI.getOperand(getOperandIdx(MI.getOpcode(), Op[0])); Register Reg = MO.getReg(); if (Reg == R600::ALU_CONST) { MachineOperand &Sel = - MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1])); + MI.getOperand(getOperandIdx(MI.getOpcode(), Op[1])); Result.push_back(std::make_pair(&MO, Sel.getImm())); continue; } - } return Result; } @@ -289,15 +287,14 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const { {R600::OpName::src2, R600::OpName::src2_sel}, }; - for (unsigned j = 0; j < 3; j++) { - int SrcIdx = getOperandIdx(MI.getOpcode(), OpTable[j][0]); + for (const auto &Op : OpTable) { + int SrcIdx = getOperandIdx(MI.getOpcode(), Op[0]); if (SrcIdx < 0) break; MachineOperand &MO = MI.getOperand(SrcIdx); Register Reg = MO.getReg(); if (Reg == R600::ALU_CONST) { - MachineOperand &Sel = - MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1])); + MachineOperand &Sel = MI.getOperand(getOperandIdx(MI.getOpcode(), Op[1])); Result.push_back(std::make_pair(&MO, Sel.getImm())); continue; } @@ -521,12 +518,11 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG, ValidSwizzle.clear(); unsigned ConstCount; BankSwizzle TransBS = ALU_VEC_012_SCL_210; - for (unsigned i = 0, e = IG.size(); i < e; ++i) { - IGSrcs.push_back(ExtractSrcs(*IG[i], PV, ConstCount)); - unsigned Op = getOperandIdx(IG[i]->getOpcode(), - R600::OpName::bank_swizzle); - ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle) - IG[i]->getOperand(Op).getImm()); + for (MachineInstr *MI : IG) { + IGSrcs.push_back(ExtractSrcs(*MI, PV, ConstCount)); + unsigned Op = getOperandIdx(MI->getOpcode(), R600::OpName::bank_swizzle); + ValidSwizzle.push_back( + (R600InstrInfo::BankSwizzle)MI->getOperand(Op).getImm()); } std::vector<std::pair<int, unsigned>> TransOps; if (!isLastAluTrans) @@ -542,8 +538,7 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG, ALU_VEC_120_SCL_212, ALU_VEC_102_SCL_221 }; - for (unsigned i = 0; i < 4; i++) { - TransBS = TransSwz[i]; + for (R600InstrInfo::BankSwizzle TransBS : TransSwz) { if (!isConstCompatible(TransBS, TransOps, ConstCount)) continue; bool Result = FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, @@ -562,9 +557,9 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts) const { assert (Consts.size() <= 12 && "Too many operands in instructions group"); unsigned Pair1 = 0, Pair2 = 0; - for (unsigned i = 0, n = Consts.size(); i < n; ++i) { - unsigned ReadConstHalf = Consts[i] & 2; - unsigned ReadConstIndex = Consts[i] & (~3); + for (unsigned Const : Consts) { + unsigned ReadConstHalf = Const & 2; + unsigned ReadConstIndex = Const & (~3); unsigned ReadHalfConst = ReadConstIndex | ReadConstHalf; if (!Pair1) { Pair1 = ReadHalfConst; @@ -587,12 +582,11 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs) const { std::vector<unsigned> Consts; SmallSet<int64_t, 4> Literals; - for (unsigned i = 0, n = MIs.size(); i < n; i++) { - MachineInstr &MI = *MIs[i]; - if (!isALUInstr(MI.getOpcode())) + for (MachineInstr *MI : MIs) { + if (!isALUInstr(MI->getOpcode())) continue; - for (const auto &Src : getSrcs(MI)) { + for (const auto &Src : getSrcs(*MI)) { if (Src.first->getReg() == R600::ALU_LITERAL_X) Literals.insert(Src.second); if (Literals.size() > 4) @@ -1330,11 +1324,11 @@ MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction( MIB->getOperand(getOperandIdx(Opcode, 
R600::OpName::pred_sel)) .setReg(MO.getReg()); - for (unsigned i = 0; i < 14; i++) { + for (unsigned Operand : Operands) { MachineOperand &MO = MI->getOperand( - getOperandIdx(MI->getOpcode(), getSlotedOps(Operands[i], Slot))); + getOperandIdx(MI->getOpcode(), getSlotedOps(Operand, Slot))); assert (MO.isImm()); - setImmOperand(*MIB, Operands[i], MO.getImm()); + setImmOperand(*MIB, Operand, MO.getImm()); } MIB->getOperand(20).setImm(0); return MIB; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp index 6aee2f591b56..d26879ed8d60 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp @@ -328,9 +328,9 @@ SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) { void R600SchedStrategy::LoadAlu() { std::vector<SUnit *> &QSrc = Pending[IDAlu]; - for (unsigned i = 0, e = QSrc.size(); i < e; ++i) { - AluKind AK = getAluKind(QSrc[i]); - AvailableAlus[AK].push_back(QSrc[i]); + for (SUnit *SU : QSrc) { + AluKind AK = getAluKind(SU); + AvailableAlus[AK].push_back(SU); } QSrc.clear(); } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp index ac6a3581e255..aa156190b7ae 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp @@ -307,8 +307,8 @@ class R600OpenCLImageTypeLoweringPass : public ModulePass { // Build new MDNode. SmallVector<Metadata *, 6> KernelMDArgs; KernelMDArgs.push_back(ConstantAsMetadata::get(NewF)); - for (unsigned i = 0; i < NumKernelArgMDNodes; ++i) - KernelMDArgs.push_back(MDNode::get(*Context, NewArgMDs.ArgVector[i])); + for (const MDVector &MDV : NewArgMDs.ArgVector) + KernelMDArgs.push_back(MDNode::get(*Context, MDV)); MDNode *NewMDNode = MDNode::get(*Context, KernelMDArgs); return std::make_tuple(NewF, NewMDNode); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp index 72cf48c04e7f..795bc898a7bf 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp @@ -150,19 +150,18 @@ bool R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched, RegSeqInfo *ToMerge, std::vector< std::pair<unsigned, unsigned>> &Remap) const { unsigned CurrentUndexIdx = 0; - for (DenseMap<Register, unsigned>::iterator It = ToMerge->RegToChan.begin(), - E = ToMerge->RegToChan.end(); It != E; ++It) { + for (auto &It : ToMerge->RegToChan) { DenseMap<Register, unsigned>::const_iterator PosInUntouched = - Untouched->RegToChan.find((*It).first); + Untouched->RegToChan.find(It.first); if (PosInUntouched != Untouched->RegToChan.end()) { - Remap.push_back(std::pair<unsigned, unsigned> - ((*It).second, (*PosInUntouched).second)); + Remap.push_back( + std::pair<unsigned, unsigned>(It.second, (*PosInUntouched).second)); continue; } if (CurrentUndexIdx >= Untouched->UndefReg.size()) return false; - Remap.push_back(std::pair<unsigned, unsigned> - ((*It).second, Untouched->UndefReg[CurrentUndexIdx++])); + Remap.push_back(std::pair<unsigned, unsigned>( + It.second, Untouched->UndefReg[CurrentUndexIdx++])); } return true; @@ 
-172,9 +171,9 @@ static unsigned getReassignedChan( const std::vector<std::pair<unsigned, unsigned>> &RemapChan, unsigned Chan) { - for (unsigned j = 0, je = RemapChan.size(); j < je; j++) { - if (RemapChan[j].first == Chan) - return RemapChan[j].second; + for (const auto &J : RemapChan) { + if (J.first == Chan) + return J.second; } llvm_unreachable("Chan wasn't reassigned"); } @@ -190,11 +189,10 @@ MachineInstr *R600VectorRegMerger::RebuildVector( Register SrcVec = BaseRSI->Instr->getOperand(0).getReg(); DenseMap<Register, unsigned> UpdatedRegToChan = BaseRSI->RegToChan; std::vector<Register> UpdatedUndef = BaseRSI->UndefReg; - for (DenseMap<Register, unsigned>::iterator It = RSI->RegToChan.begin(), - E = RSI->RegToChan.end(); It != E; ++It) { + for (const auto &It : RSI->RegToChan) { Register DstReg = MRI->createVirtualRegister(&R600::R600_Reg128RegClass); - unsigned SubReg = (*It).first; - unsigned Swizzle = (*It).second; + unsigned SubReg = It.first; + unsigned Swizzle = It.second; unsigned Chan = getReassignedChan(RemapChan, Swizzle); MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(R600::INSERT_SUBREG), @@ -234,14 +232,12 @@ MachineInstr *R600VectorRegMerger::RebuildVector( } void R600VectorRegMerger::RemoveMI(MachineInstr *MI) { - for (InstructionSetMap::iterator It = PreviousRegSeqByReg.begin(), - E = PreviousRegSeqByReg.end(); It != E; ++It) { - std::vector<MachineInstr *> &MIs = (*It).second; + for (auto &It : PreviousRegSeqByReg) { + std::vector<MachineInstr *> &MIs = It.second; MIs.erase(llvm::find(MIs, MI), MIs.end()); } - for (InstructionSetMap::iterator It = PreviousRegSeqByUndefCount.begin(), - E = PreviousRegSeqByUndefCount.end(); It != E; ++It) { - std::vector<MachineInstr *> &MIs = (*It).second; + for (auto &It : PreviousRegSeqByUndefCount) { + std::vector<MachineInstr *> &MIs = It.second; MIs.erase(llvm::find(MIs, MI), MIs.end()); } } @@ -255,9 +251,9 @@ void R600VectorRegMerger::SwizzleInput(MachineInstr &MI, Offset = 3; for (unsigned i = 0; i < 4; i++) { unsigned Swizzle = MI.getOperand(i + Offset).getImm() + 1; - for (unsigned j = 0, e = RemapChan.size(); j < e; j++) { - if (RemapChan[j].first == Swizzle) { - MI.getOperand(i + Offset).setImm(RemapChan[j].second - 1); + for (const auto &J : RemapChan) { + if (J.first == Swizzle) { + MI.getOperand(i + Offset).setImm(J.second - 1); break; } } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Packetizer.cpp index beb0aad86e89..fbe2a1cd9fba 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Packetizer.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Packetizer.cpp @@ -127,8 +127,8 @@ private: R600::OpName::src1, R600::OpName::src2 }; - for (unsigned i = 0; i < 3; i++) { - int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Ops[i]); + for (unsigned Op : Ops) { + int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Op); if (OperandIdx < 0) continue; Register Src = MI.getOperand(OperandIdx).getReg(); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp index 99a1a8e9871a..c329bae50f92 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp @@ -54,10 +54,8 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { reserveRegisterTuples(Reserved, R600::PRED_SEL_ONE); reserveRegisterTuples(Reserved, R600::INDIRECT_BASE_ADDR); - 
for (TargetRegisterClass::iterator I = R600::R600_AddrRegClass.begin(), - E = R600::R600_AddrRegClass.end(); I != E; ++I) { - reserveRegisterTuples(Reserved, *I); - } + for (MCPhysReg R : R600::R600_AddrRegClass) + reserveRegisterTuples(Reserved, R); TII->reserveIndirectRegisters(Reserved, MF, *this); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 200e00ee5521..1f93284fc7ee 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1620,7 +1620,7 @@ bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) { // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users, // in which case we can erase them all later in runOnMachineFunction. if (MRI->use_nodbg_empty(MI.getOperand(0).getReg())) - MI.eraseFromParentAndMarkDBGValuesForRemoval(); + MI.eraseFromParent(); return true; } @@ -1821,7 +1821,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) { auto &SrcOp = InstToErase->getOperand(1); auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register(); - InstToErase->eraseFromParentAndMarkDBGValuesForRemoval(); + InstToErase->eraseFromParent(); InstToErase = nullptr; if (!SrcReg || SrcReg.isPhysical()) break; @@ -1831,7 +1831,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { } if (InstToErase && InstToErase->isRegSequence() && MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) - InstToErase->eraseFromParentAndMarkDBGValuesForRemoval(); + InstToErase->eraseFromParent(); } } return true; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 4706c74be721..d4fe74ecb96e 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1167,11 +1167,13 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( if (SpillVGPRToAGPR) { // To track the spill frame indices handled in this pass. BitVector SpillFIs(MFI.getObjectIndexEnd(), false); + BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false); bool SeenDbgInstr = false; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) { + int FrameIndex; if (MI.isDebugInstr()) SeenDbgInstr = true; @@ -1191,10 +1193,18 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( SpillFIs.set(FI); continue; } - } + } else if (TII->isStoreToStackSlot(MI, FrameIndex) || + TII->isLoadFromStackSlot(MI, FrameIndex)) + NonVGPRSpillFIs.set(FrameIndex); } } + // Stack slot coloring may assign different objets to the same stack slot. + // If not, then the VGPR to AGPR spill slot is dead. 
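
The new NonVGPRSpillFIs bookkeeping implements a simple set difference: a frame index that is only ever touched by VGPR-to-AGPR spills, and never by an ordinary stack load or store (for example after stack slot coloring merges objects into one slot), can be marked dead. A self-contained sketch of that check with plain bitsets; the sizes and names are illustrative, not the pass's own:

#include <bitset>
#include <cstddef>
#include <iostream>

int main() {
  constexpr std::size_t NumFrameIndexes = 8;
  std::bitset<NumFrameIndexes> SpillFIs;        // FIs handled as VGPR->AGPR spills
  std::bitset<NumFrameIndexes> NonVGPRSpillFIs; // FIs still used by real stack ops

  SpillFIs.set(2);
  SpillFIs.set(5);
  NonVGPRSpillFIs.set(5); // stack slot coloring reused FI 5 for another object

  for (std::size_t FI = 0; FI < NumFrameIndexes; ++FI)
    if (SpillFIs.test(FI) && !NonVGPRSpillFIs.test(FI))
      std::cout << "frame index " << FI << " is a dead VGPR->AGPR spill slot\n";
}
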
+ for (unsigned FI : SpillFIs.set_bits()) + if (!NonVGPRSpillFIs.test(FI)) + FuncInfo->setVGPRToAGPRSpillDead(FI); + for (MachineBasicBlock &MBB : MF) { for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs()) MBB.addLiveIn(Reg); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 35b72f5d201b..9f138136e6e9 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/IR/DiagnosticInfo.h" @@ -2062,33 +2063,30 @@ void SITargetLowering::allocateSpecialInputSGPRs( SIMachineFunctionInfo &Info) const { auto &ArgInfo = Info.getArgInfo(); - // We need to allocate these in place regardless of their use. - const bool IsFixed = AMDGPUTargetMachine::EnableFixedFunctionABI; - // TODO: Unify handling with private memory pointers. - if (IsFixed || Info.hasDispatchPtr()) + if (Info.hasDispatchPtr()) allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr); - if (IsFixed || Info.hasQueuePtr()) + if (Info.hasQueuePtr()) allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr); // Implicit arg ptr takes the place of the kernarg segment pointer. This is a // constant offset from the kernarg segment. - if (IsFixed || Info.hasImplicitArgPtr()) + if (Info.hasImplicitArgPtr()) allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr); - if (IsFixed || Info.hasDispatchID()) + if (Info.hasDispatchID()) allocateSGPR64Input(CCInfo, ArgInfo.DispatchID); // flat_scratch_init is not applicable for non-kernel functions. - if (IsFixed || Info.hasWorkGroupIDX()) + if (Info.hasWorkGroupIDX()) allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX); - if (IsFixed || Info.hasWorkGroupIDY()) + if (Info.hasWorkGroupIDY()) allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY); - if (IsFixed || Info.hasWorkGroupIDZ()) + if (Info.hasWorkGroupIDZ()) allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ); } @@ -2419,10 +2417,9 @@ SDValue SITargetLowering::LowerFormalArguments( if (IsEntryFunc) { allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info); - } else { + } else if (!IsGraphics) { // For the fixed ABI, pass workitem IDs in the last argument register. - if (AMDGPUTargetMachine::EnableFixedFunctionABI) - allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); + allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); } if (IsKernel) { @@ -2549,17 +2546,13 @@ SDValue SITargetLowering::LowerFormalArguments( InVals.push_back(Val); } - if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) { - // Special inputs come after user arguments. - allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info); - } - // Start adding system SGPRs. 
if (IsEntryFunc) { allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics); } else { CCInfo.AllocateReg(Info->getScratchRSrcReg()); - allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); + if (!IsGraphics) + allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); } auto &ArgUsageInfo = @@ -3123,8 +3116,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg); - if (AMDGPUTargetMachine::EnableFixedFunctionABI && - CallConv != CallingConv::AMDGPU_Gfx) { + if (CallConv != CallingConv::AMDGPU_Gfx) { // With a fixed ABI, allocate fixed registers before user arguments. passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); } @@ -3263,12 +3255,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, } } - if (!AMDGPUTargetMachine::EnableFixedFunctionABI && - CallConv != CallingConv::AMDGPU_Gfx) { - // Copy special input registers after user input arguments. - passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); - } - if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); @@ -6282,10 +6268,6 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } } - // Push back extra arguments. - for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) - VAddrs.push_back(Op.getOperand(ArgOffset + I)); - // Check for 16 bit addresses or derivatives and pack if true. MVT VAddrVT = Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType(); @@ -6298,6 +6280,17 @@ SDValue SITargetLowering::lowerImage(SDValue Op, MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16; + // Push back extra arguments. + for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) { + if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) { + // Special handling of bias when A16 is on. Bias is of type half but + // occupies full 32-bit. + SDValue bias = DAG.getBuildVector( MVT::v2f16, DL, {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)}); + VAddrs.push_back(bias); + } else + VAddrs.push_back(Op.getOperand(ArgOffset + I)); + } + if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) { // 16 bit gradients are supported, but are tied to the A16 control // so both gradients and addresses must be 16 bit @@ -7502,8 +7495,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, assert(NodePtr.getValueType() == MVT::i32 || NodePtr.getValueType() == MVT::i64); - assert(RayDir.getValueType() == MVT::v4f16 || - RayDir.getValueType() == MVT::v4f32); + assert(RayDir.getValueType() == MVT::v3f16 || + RayDir.getValueType() == MVT::v3f32); if (!Subtarget->hasGFX10_AEncoding()) { emitRemovedIntrinsicError(DAG, DL, Op.getValueType()); @@ -9837,11 +9830,13 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF, if (Opcode == AMDGPU::G_FCANONICALIZE) return true; - if (Opcode == AMDGPU::G_FCONSTANT) { - auto F = MI->getOperand(1).getFPImm()->getValueAPF(); - if (F.isNaN() && F.isSignaling()) + Optional<FPValueAndVReg> FCR; + // Constant splat (can be padded with undef) or scalar constant. 
+ if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) { + if (FCR->Value.isSignaling()) return false; - return !F.isDenormal() || denormalsEnabledForType(MRI.getType(Reg), MF); + return !FCR->Value.isDenormal() || + denormalsEnabledForType(MRI.getType(FCR->VReg), MF); } if (MaxDepth == 0) @@ -11514,7 +11509,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, // Prefer VGPRs over AGPRs in mAI instructions where possible. // This saves a chain-copy of registers and better ballance register // use between vgpr and agpr as agpr tuples tend to be big. - if (const MCOperandInfo *OpInfo = MI.getDesc().OpInfo) { + if (MI.getDesc().OpInfo) { unsigned Opc = MI.getOpcode(); const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); for (auto I : { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), @@ -12477,6 +12472,6 @@ SITargetLowering::getTypeLegalizationCost(const DataLayout &DL, if (Size <= 256) return Cost; - Cost.first = (Size + 255) / 256; + Cost.first += (Size + 255) / 256; return Cost; } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index c9d9dd1fb82c..6fbe5d45ce0a 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -30,6 +30,7 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/Sequence.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/InitializePasses.h" #include "llvm/Support/DebugCounter.h" @@ -51,26 +52,6 @@ static cl::opt<bool> ForceEmitZeroFlag( cl::init(false), cl::Hidden); namespace { - -template <typename EnumT> -class enum_iterator - : public iterator_facade_base<enum_iterator<EnumT>, - std::forward_iterator_tag, const EnumT> { - EnumT Value; -public: - enum_iterator() = default; - enum_iterator(EnumT Value) : Value(Value) {} - - enum_iterator &operator++() { - Value = static_cast<EnumT>(Value + 1); - return *this; - } - - bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; } - - EnumT operator*() const { return Value; } -}; - // Class of object that encapsulates latest instruction counter score // associated with the operand. Used for determining whether // s_waitcnt instruction needs to be emitted. @@ -78,27 +59,32 @@ public: #define CNT_MASK(t) (1u << (t)) enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS }; +} // namespace -iterator_range<enum_iterator<InstCounterType>> inst_counter_types() { - return make_range(enum_iterator<InstCounterType>(VM_CNT), - enum_iterator<InstCounterType>(NUM_INST_CNTS)); -} +namespace llvm { +template <> struct enum_iteration_traits<InstCounterType> { + static constexpr bool is_iterable = true; +}; +} // namespace llvm + +namespace { +auto inst_counter_types() { return enum_seq(VM_CNT, NUM_INST_CNTS); } using RegInterval = std::pair<int, int>; -struct { +struct HardwareLimits { unsigned VmcntMax; unsigned ExpcntMax; unsigned LgkmcntMax; unsigned VscntMax; -} HardwareLimits; +}; -struct { +struct RegisterEncoding { unsigned VGPR0; unsigned VGPRL; unsigned SGPR0; unsigned SGPRL; -} RegisterEncoding; +}; enum WaitEventType { VMEM_ACCESS, // vector-memory read & write @@ -194,18 +180,20 @@ void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) { // "s_waitcnt 0" before use. 
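
The waitcnt changes above drop a hand-rolled enum_iterator in favour of llvm::enum_seq from llvm/ADT/Sequence.h, which only asks the enum to opt in through an enum_iteration_traits specialization. A rough standalone approximation of what such an enum range provides (illustrative only; in-tree code should simply use the LLVM helper):

#include <iostream>
#include <vector>

enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };

// Build a half-open range [Begin, End) of enum values; enum_seq does this
// lazily with iterators, but a vector is good enough for illustration.
static std::vector<InstCounterType> instCounterTypes() {
  std::vector<InstCounterType> Result;
  for (int V = VM_CNT; V < NUM_INST_CNTS; ++V)
    Result.push_back(static_cast<InstCounterType>(V));
  return Result;
}

int main() {
  for (InstCounterType T : instCounterTypes())
    std::cout << "counter " << static_cast<int>(T) << '\n';
}
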
class WaitcntBrackets { public: - WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {} + WaitcntBrackets(const GCNSubtarget *SubTarget, HardwareLimits Limits, + RegisterEncoding Encoding) + : ST(SubTarget), Limits(Limits), Encoding(Encoding) {} - static unsigned getWaitCountMax(InstCounterType T) { + unsigned getWaitCountMax(InstCounterType T) const { switch (T) { case VM_CNT: - return HardwareLimits.VmcntMax; + return Limits.VmcntMax; case LGKM_CNT: - return HardwareLimits.LgkmcntMax; + return Limits.LgkmcntMax; case EXP_CNT: - return HardwareLimits.ExpcntMax; + return Limits.ExpcntMax; case VS_CNT: - return HardwareLimits.VscntMax; + return Limits.VscntMax; default: break; } @@ -338,6 +326,8 @@ private: unsigned OpNo, unsigned Val); const GCNSubtarget *ST = nullptr; + HardwareLimits Limits = {}; + RegisterEncoding Encoding = {}; unsigned ScoreLBs[NUM_INST_CNTS] = {0}; unsigned ScoreUBs[NUM_INST_CNTS] = {0}; unsigned PendingEvents = 0; @@ -471,14 +461,14 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)); if (TRI->isVectorRegister(*MRI, Op.getReg())) { - assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL); - Result.first = Reg - RegisterEncoding.VGPR0; + assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL); + Result.first = Reg - Encoding.VGPR0; if (TRI->isAGPR(*MRI, Op.getReg())) Result.first += AGPR_OFFSET; assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS); } else if (TRI->isSGPRReg(*MRI, Op.getReg())) { - assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS); - Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS; + assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS); + Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS; assert(Result.first >= NUM_ALL_VGPRS && Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS); } @@ -1589,20 +1579,22 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; - HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV); - HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV); - HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV); - HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0; + HardwareLimits Limits = {}; + Limits.VmcntMax = AMDGPU::getVmcntBitMask(IV); + Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV); + Limits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV); + Limits.VscntMax = ST->hasVscnt() ? 
63 : 0; unsigned NumVGPRsMax = ST->getAddressableNumVGPRs(); unsigned NumSGPRsMax = ST->getAddressableNumSGPRs(); assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS); assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS); - RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0); - RegisterEncoding.VGPRL = RegisterEncoding.VGPR0 + NumVGPRsMax - 1; - RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0); - RegisterEncoding.SGPRL = RegisterEncoding.SGPR0 + NumSGPRsMax - 1; + RegisterEncoding Encoding = {}; + Encoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0); + Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1; + Encoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0); + Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1; TrackedWaitcntSet.clear(); BlockInfos.clear(); @@ -1652,9 +1644,9 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { *Brackets = *BI.Incoming; } else { if (!Brackets) - Brackets = std::make_unique<WaitcntBrackets>(ST); + Brackets = std::make_unique<WaitcntBrackets>(ST, Limits, Encoding); else - *Brackets = WaitcntBrackets(ST); + *Brackets = WaitcntBrackets(ST, Limits, Encoding); } Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets); @@ -1686,45 +1678,47 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { } } while (Repeat); - SmallVector<MachineBasicBlock *, 4> EndPgmBlocks; - - bool HaveScalarStores = false; + if (ST->hasScalarStores()) { + SmallVector<MachineBasicBlock *, 4> EndPgmBlocks; + bool HaveScalarStores = false; - for (MachineBasicBlock &MBB : MF) { - for (MachineInstr &MI : MBB) { - if (!HaveScalarStores && TII->isScalarStore(MI)) - HaveScalarStores = true; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (!HaveScalarStores && TII->isScalarStore(MI)) + HaveScalarStores = true; - if (MI.getOpcode() == AMDGPU::S_ENDPGM || - MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) - EndPgmBlocks.push_back(&MBB); + if (MI.getOpcode() == AMDGPU::S_ENDPGM || + MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) + EndPgmBlocks.push_back(&MBB); + } } - } - if (HaveScalarStores) { - // If scalar writes are used, the cache must be flushed or else the next - // wave to reuse the same scratch memory can be clobbered. - // - // Insert s_dcache_wb at wave termination points if there were any scalar - // stores, and only if the cache hasn't already been flushed. This could be - // improved by looking across blocks for flushes in postdominating blocks - // from the stores but an explicitly requested flush is probably very rare. - for (MachineBasicBlock *MBB : EndPgmBlocks) { - bool SeenDCacheWB = false; - - for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; - ++I) { - if (I->getOpcode() == AMDGPU::S_DCACHE_WB) - SeenDCacheWB = true; - else if (TII->isScalarStore(*I)) - SeenDCacheWB = false; - - // FIXME: It would be better to insert this before a waitcnt if any. - if ((I->getOpcode() == AMDGPU::S_ENDPGM || - I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && - !SeenDCacheWB) { - Modified = true; - BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB)); + if (HaveScalarStores) { + // If scalar writes are used, the cache must be flushed or else the next + // wave to reuse the same scratch memory can be clobbered. + // + // Insert s_dcache_wb at wave termination points if there were any scalar + // stores, and only if the cache hasn't already been flushed. 
This could + // be improved by looking across blocks for flushes in postdominating + // blocks from the stores but an explicitly requested flush is probably + // very rare. + for (MachineBasicBlock *MBB : EndPgmBlocks) { + bool SeenDCacheWB = false; + + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); + I != E; ++I) { + if (I->getOpcode() == AMDGPU::S_DCACHE_WB) + SeenDCacheWB = true; + else if (TII->isScalarStore(*I)) + SeenDCacheWB = false; + + // FIXME: It would be better to insert this before a waitcnt if any. + if ((I->getOpcode() == AMDGPU::S_ENDPGM || + I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && + !SeenDCacheWB) { + Modified = true; + BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB)); + } } } } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 92f5322b8ad2..1755b93538ce 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -899,8 +899,12 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned EltSize = 4; unsigned Opcode = AMDGPU::V_MOV_B32_e32; if (RI.isAGPRClass(RC)) { - Opcode = (RI.hasVGPRs(SrcRC)) ? - AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::INSTRUCTION_LIST_END; + if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC)) + Opcode = AMDGPU::V_ACCVGPR_MOV_B32; + else if (RI.hasVGPRs(SrcRC)) + Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64; + else + Opcode = AMDGPU::INSTRUCTION_LIST_END; } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) { Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64; } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) && @@ -1417,6 +1421,33 @@ static unsigned getAGPRSpillSaveOpcode(unsigned Size) { } } +static unsigned getAVSpillSaveOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_AV32_SAVE; + case 8: + return AMDGPU::SI_SPILL_AV64_SAVE; + case 12: + return AMDGPU::SI_SPILL_AV96_SAVE; + case 16: + return AMDGPU::SI_SPILL_AV128_SAVE; + case 20: + return AMDGPU::SI_SPILL_AV160_SAVE; + case 24: + return AMDGPU::SI_SPILL_AV192_SAVE; + case 28: + return AMDGPU::SI_SPILL_AV224_SAVE; + case 32: + return AMDGPU::SI_SPILL_AV256_SAVE; + case 64: + return AMDGPU::SI_SPILL_AV512_SAVE; + case 128: + return AMDGPU::SI_SPILL_AV1024_SAVE; + default: + llvm_unreachable("unknown register size"); + } +} + void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, @@ -1463,21 +1494,11 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, return; } - unsigned Opcode = RI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(SpillSize) - : getVGPRSpillSaveOpcode(SpillSize); + unsigned Opcode = RI.isVectorSuperClass(RC) ? getAVSpillSaveOpcode(SpillSize) + : RI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(SpillSize) + : getVGPRSpillSaveOpcode(SpillSize); MFI->setHasSpilledVGPRs(); - if (RI.isVectorSuperClass(RC)) { - // Convert an AV spill into a VGPR spill. Introduce a copy from AV to an - // equivalent VGPR register beforehand. Regalloc might want to introduce - // AV spills only to be relevant until rewriter at which they become - // either spills of VGPRs or AGPRs. 
- Register TmpVReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(RC)); - BuildMI(MBB, MI, DL, get(TargetOpcode::COPY), TmpVReg) - .addReg(SrcReg, RegState::Kill); - SrcReg = TmpVReg; - } - BuildMI(MBB, MI, DL, get(Opcode)) .addReg(SrcReg, getKillRegState(isKill)) // data .addFrameIndex(FrameIndex) // addr @@ -1567,6 +1588,33 @@ static unsigned getAGPRSpillRestoreOpcode(unsigned Size) { } } +static unsigned getAVSpillRestoreOpcode(unsigned Size) { + switch (Size) { + case 4: + return AMDGPU::SI_SPILL_AV32_RESTORE; + case 8: + return AMDGPU::SI_SPILL_AV64_RESTORE; + case 12: + return AMDGPU::SI_SPILL_AV96_RESTORE; + case 16: + return AMDGPU::SI_SPILL_AV128_RESTORE; + case 20: + return AMDGPU::SI_SPILL_AV160_RESTORE; + case 24: + return AMDGPU::SI_SPILL_AV192_RESTORE; + case 28: + return AMDGPU::SI_SPILL_AV224_RESTORE; + case 32: + return AMDGPU::SI_SPILL_AV256_RESTORE; + case 64: + return AMDGPU::SI_SPILL_AV512_RESTORE; + case 128: + return AMDGPU::SI_SPILL_AV1024_RESTORE; + default: + llvm_unreachable("unknown register size"); + } +} + void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, @@ -1609,26 +1657,15 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, return; } - unsigned Opcode = RI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(SpillSize) - : getVGPRSpillRestoreOpcode(SpillSize); - - bool IsVectorSuperClass = RI.isVectorSuperClass(RC); - Register TmpReg = DestReg; - if (IsVectorSuperClass) { - // For AV classes, insert the spill restore to a VGPR followed by a copy - // into an equivalent AV register. - MachineRegisterInfo &MRI = MF->getRegInfo(); - DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(RC)); - } + unsigned Opcode = RI.isVectorSuperClass(RC) + ? getAVSpillRestoreOpcode(SpillSize) + : RI.isAGPRClass(RC) ? 
getAGPRSpillRestoreOpcode(SpillSize) + : getVGPRSpillRestoreOpcode(SpillSize); BuildMI(MBB, MI, DL, get(Opcode), DestReg) - .addFrameIndex(FrameIndex) // vaddr - .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset - .addImm(0) // offset - .addMemOperand(MMO); - - if (IsVectorSuperClass) - BuildMI(MBB, MI, DL, get(TargetOpcode::COPY), TmpReg) - .addReg(DestReg, RegState::Kill); + .addFrameIndex(FrameIndex) // vaddr + .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset + .addImm(0) // offset + .addMemOperand(MMO); } void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, @@ -2358,8 +2395,6 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx)); auto *ShAmt = MCConstantExpr::create(32, MCCtx); OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx)); - - return; } unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { @@ -3106,23 +3141,26 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, } static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, - int64_t &Imm) { + int64_t &Imm, MachineInstr **DefMI = nullptr) { if (Reg.isPhysical()) return false; auto *Def = MRI.getUniqueVRegDef(Reg); if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) { Imm = Def->getOperand(1).getImm(); + if (DefMI) + *DefMI = Def; return true; } return false; } -static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm) { +static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm, + MachineInstr **DefMI = nullptr) { if (!MO->isReg()) return false; const MachineFunction *MF = MO->getParent()->getParent()->getParent(); const MachineRegisterInfo &MRI = MF->getRegInfo(); - return getFoldableImm(MO->getReg(), MRI, Imm); + return getFoldableImm(MO->getReg(), MRI, Imm, DefMI); } static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, @@ -3195,8 +3233,20 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, // If we have an SGPR input, we will violate the constant bus restriction. (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() || !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) { + MachineInstr *DefMI; + const auto killDef = [&DefMI, &MBB, this]() -> void { + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + // The only user is the instruction which will be killed. + if (!MRI.hasOneNonDBGUse(DefMI->getOperand(0).getReg())) + return; + // We cannot just remove the DefMI here, calling pass will crash. + DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF)); + for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I) + DefMI->RemoveOperand(I); + }; + int64_t Imm; - if (getFoldableImm(Src2, Imm)) { + if (getFoldableImm(Src2, Imm, &DefMI)) { unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32) : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32); @@ -3209,13 +3259,14 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, updateLiveVariables(LV, MI, *MIB); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *MIB); + killDef(); return MIB; } } unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32) : (IsF16 ? 
AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); - if (getFoldableImm(Src1, Imm)) { + if (getFoldableImm(Src1, Imm, &DefMI)) { if (pseudoToMCOpcode(NewOpc) != -1) { MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) .add(*Dst) @@ -3225,10 +3276,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, updateLiveVariables(LV, MI, *MIB); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *MIB); + killDef(); return MIB; } } - if (getFoldableImm(Src0, Imm)) { + if (getFoldableImm(Src0, Imm, &DefMI)) { if (pseudoToMCOpcode(NewOpc) != -1 && isOperandLegal( MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0), @@ -3241,12 +3293,13 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, updateLiveVariables(LV, MI, *MIB); if (LIS) LIS->ReplaceMachineInstrInMaps(MI, *MIB); + killDef(); return MIB; } } } - unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_e64 + unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64 : IsF64 ? AMDGPU::V_FMA_F64_e64 : AMDGPU::V_FMA_F32_e64) : (IsF16 ? AMDGPU::V_MAD_F16_e64 : AMDGPU::V_MAD_F32_e64); @@ -3605,12 +3658,6 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const { const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2); // Can't shrink instruction with three operands. - // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add - // a special case for it. It can only be shrunk if the third operand - // is vcc, and src0_modifiers and src1_modifiers are not set. - // We should handle this the same way we handle vopc, by addding - // a register allocation hint pre-regalloc and then do the shrinking - // post-regalloc. if (Src2) { switch (MI.getOpcode()) { default: return false; @@ -4563,8 +4610,9 @@ static unsigned adjustAllocatableRegClass(const GCNSubtarget &ST, unsigned RCID, bool IsAllocatable) { if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) && - (TID.mayLoad() || TID.mayStore() || - (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) { + (((TID.mayLoad() || TID.mayStore()) && + !(TID.TSFlags & SIInstrFlags::VGPRSpill)) || + (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) { switch (RCID) { case AMDGPU::AV_32RegClassID: return AMDGPU::VGPR_32RegClassID; case AMDGPU::AV_64RegClassID: return AMDGPU::VReg_64RegClassID; @@ -5001,8 +5049,7 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, --ConstantBusLimit; } - for (unsigned i = 0; i < 3; ++i) { - int Idx = VOP3Idx[i]; + for (int Idx : VOP3Idx) { if (Idx == -1) break; MachineOperand &MO = MI.getOperand(Idx); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 47ee83eb9351..dda92d3d25ff 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1350,11 +1350,11 @@ def PackedI16InputMods : PackedIntInputMods<PackedI16InputModsMatchClass>; // Complex patterns //===----------------------------------------------------------------------===// -def DS1Addr1Offset : ComplexPattern<i32, 2, "SelectDS1Addr1Offset">; -def DS64Bit4ByteAligned : ComplexPattern<i32, 3, "SelectDS64Bit4ByteAligned">; -def DS128Bit8ByteAligned : ComplexPattern<i64, 3, "SelectDS128Bit8ByteAligned">; +def DS1Addr1Offset : ComplexPattern<iPTR, 2, "SelectDS1Addr1Offset">; +def DS64Bit4ByteAligned : ComplexPattern<iPTR, 3, "SelectDS64Bit4ByteAligned">; +def DS128Bit8ByteAligned : ComplexPattern<iPTR, 3, 
"SelectDS128Bit8ByteAligned">; -def MOVRELOffset : ComplexPattern<i32, 2, "SelectMOVRELOffset">; +def MOVRELOffset : ComplexPattern<iPTR, 2, "SelectMOVRELOffset">; def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">; def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td index d55d8da8699a..636337ede000 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -761,6 +761,17 @@ defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>; defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>; defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>; +defm SI_SPILL_AV32 : SI_SPILL_VGPR <AV_32, 1>; +defm SI_SPILL_AV64 : SI_SPILL_VGPR <AV_64, 1>; +defm SI_SPILL_AV96 : SI_SPILL_VGPR <AV_96, 1>; +defm SI_SPILL_AV128 : SI_SPILL_VGPR <AV_128, 1>; +defm SI_SPILL_AV160 : SI_SPILL_VGPR <AV_160, 1>; +defm SI_SPILL_AV192 : SI_SPILL_VGPR <AV_192, 1>; +defm SI_SPILL_AV224 : SI_SPILL_VGPR <AV_224, 1>; +defm SI_SPILL_AV256 : SI_SPILL_VGPR <AV_256, 1>; +defm SI_SPILL_AV512 : SI_SPILL_VGPR <AV_512, 1>; +defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024, 1>; + def SI_PC_ADD_REL_OFFSET : SPseudoInstSI < (outs SReg_64:$dst), (ins si_ga:$ptr_lo, si_ga:$ptr_hi), @@ -2106,6 +2117,19 @@ def : GCNPat < } // end isWave32 def : GCNPat < + (i32 (DivergentBinFrag<xor> i32:$src0, (i32 -1))), + (V_NOT_B32_e32 $src0) +>; + +def : GCNPat < + (i64 (DivergentBinFrag<xor> i64:$src0, (i64 -1))), + (REG_SEQUENCE VReg_64, + (V_NOT_B32_e32 (i32 (EXTRACT_SUBREG i64:$src0, sub0))), sub0, + (V_NOT_B32_e32 (i32 (EXTRACT_SUBREG i64:$src0, sub1))), sub1 + ) +>; + +def : GCNPat < (f16 (sint_to_fp i1:$src)), (V_CVT_F16_F32_e32 ( V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), @@ -2188,18 +2212,18 @@ def : GCNPat < >; def : GCNPat < - (i1 (trunc i32:$a)), - (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1)) + (i1 (DivergentUnaryFrag<trunc> i32:$a)), + (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1)) >; def : GCNPat < - (i1 (trunc i16:$a)), - (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1)) + (i1 (DivergentUnaryFrag<trunc> i16:$a)), + (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1)) >; def : GCNPat < - (i1 (trunc i64:$a)), - (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), + (i1 (DivergentUnaryFrag<trunc> i64:$a)), + (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1)) >; @@ -2405,21 +2429,37 @@ def : GCNPat < // COPY is workaround tablegen bug from multiple outputs // from S_LSHL_B32's multiple outputs from implicit scc def. 
def : GCNPat < - (v2i16 (build_vector (i16 0), (i16 SReg_32:$src1))), + (v2i16 (UniformBinFrag<build_vector> (i16 0), (i16 SReg_32:$src1))), (S_LSHL_B32 SReg_32:$src1, (i16 16)) >; def : GCNPat < - (v2i16 (build_vector (i16 SReg_32:$src1), (i16 0))), + (v2i16 (DivergentBinFrag<build_vector> (i16 0), (i16 SReg_32:$src1))), + (v2i16 (V_LSHLREV_B32_e64 (i16 16), SReg_32:$src1)) +>; + + +def : GCNPat < + (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))), (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1) >; def : GCNPat < - (v2f16 (build_vector (f16 SReg_32:$src1), (f16 FP_ZERO))), + (v2i16 (DivergentBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))), + (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src1)) +>; + +def : GCNPat < + (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))), (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1) >; def : GCNPat < + (v2f16 (DivergentBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))), + (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src1)) +>; + +def : GCNPat < (v2i16 (build_vector (i16 SReg_32:$src0), (i16 undef))), (COPY_TO_REGCLASS SReg_32:$src0, SReg_32) >; @@ -2435,42 +2475,74 @@ def : GCNPat < >; def : GCNPat < - (v2i16 (build_vector (i16 undef), (i16 SReg_32:$src1))), + (v2i16 (UniformBinFrag<build_vector> (i16 undef), (i16 SReg_32:$src1))), (S_LSHL_B32 SReg_32:$src1, (i32 16)) >; def : GCNPat < - (v2f16 (build_vector (f16 undef), (f16 SReg_32:$src1))), + (v2i16 (DivergentBinFrag<build_vector> (i16 undef), (i16 SReg_32:$src1))), + (v2i16 (V_LSHLREV_B32_e64 (i32 16), SReg_32:$src1)) +>; + + +def : GCNPat < + (v2f16 (UniformBinFrag<build_vector> (f16 undef), (f16 SReg_32:$src1))), (S_LSHL_B32 SReg_32:$src1, (i32 16)) >; +def : GCNPat < + (v2f16 (DivergentBinFrag<build_vector> (f16 undef), (f16 SReg_32:$src1))), + (v2f16 (V_LSHLREV_B32_e64 (i32 16), SReg_32:$src1)) +>; + let SubtargetPredicate = HasVOP3PInsts in { def : GCNPat < - (v2i16 (build_vector (i16 SReg_32:$src0), (i16 SReg_32:$src1))), + (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 SReg_32:$src1))), (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1) >; +def : GCNPat < + (v2i16 (DivergentBinFrag<build_vector> (i16 SReg_32:$src0), (i16 SReg_32:$src1))), + (v2i16 (V_LSHL_OR_B32_e64 $src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), $src0)))) +>; + // With multiple uses of the shift, this will duplicate the shift and // increase register pressure. def : GCNPat < - (v2i16 (build_vector (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), + (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), (v2i16 (S_PACK_LH_B32_B16 SReg_32:$src0, SReg_32:$src1)) >; +def : GCNPat < + (v2i16 (DivergentBinFrag<build_vector> (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), + (v2i16 (V_BFI_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src0, SReg_32:$src1)) +>; + def : GCNPat < - (v2i16 (build_vector (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), + (v2i16 (UniformBinFrag<build_vector> (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), (S_PACK_HH_B32_B16 SReg_32:$src0, SReg_32:$src1) >; -// TODO: Should source modifiers be matched to v_pack_b32_f16? 
def : GCNPat < - (v2f16 (build_vector (f16 SReg_32:$src0), (f16 SReg_32:$src1))), + (v2i16 (DivergentBinFrag<build_vector> (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), + (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), + (v2i16 (V_AND_OR_B32_e64 SReg_32:$src1, (i32 (V_MOV_B32_e32 (i32 0xffff0000))), (i32 (V_LSHRREV_B32_e64 (i32 16), SReg_32:$src0)))) +>; + +def : GCNPat < + (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src0), (f16 SReg_32:$src1))), (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1) >; def : GCNPat < + (v2f16 (DivergentBinFrag<build_vector> (f16 SReg_32:$src0), (f16 SReg_32:$src1))), + (v2f16 (V_LSHL_OR_B32_e64 SReg_32:$src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src0)))) +>; + + +def : GCNPat < (v2f16 (is_canonicalized<build_vector> (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)), (f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))), (V_PACK_B32_F16_e64 $src0_mods, VGPR_32:$src0, $src1_mods, VGPR_32:$src1) @@ -2866,6 +2938,18 @@ def G_AMDGPU_UMED3 : AMDGPUGenericInstruction { let hasSideEffects = 0; } +def G_AMDGPU_FMED3 : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2); + let hasSideEffects = 0; +} + +def G_AMDGPU_CLAMP : AMDGPUGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src); + let hasSideEffects = 0; +} + // Atomic cmpxchg. $cmpval ad $newval are packed in a single vector // operand Expects a MachineMemOperand in addition to explicit // operands. diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index c4007f56f350..3ce368ef4db9 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -62,11 +62,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) // calls. const bool HasCalls = F.hasFnAttribute("amdgpu-calls"); - // Enable all kernel inputs if we have the fixed ABI. Don't bother if we don't - // have any calls. - const bool UseFixedABI = AMDGPUTargetMachine::EnableFixedFunctionABI && - CC != CallingConv::AMDGPU_Gfx && - (!isEntryFunction() || HasCalls); const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL; @@ -80,7 +75,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) } if (!isEntryFunction()) { - if (UseFixedABI) + if (CC != CallingConv::AMDGPU_Gfx) ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo; // TODO: Pick a high register, and shift down, similar to a kernel. @@ -110,20 +105,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) else if (ST.isMesaGfxShader(F)) ImplicitBufferPtr = true; - if (UseFixedABI) { - DispatchPtr = true; - QueuePtr = true; - ImplicitArgPtr = true; - WorkGroupIDX = true; - WorkGroupIDY = true; - WorkGroupIDZ = true; - WorkItemIDX = true; - WorkItemIDY = true; - WorkItemIDZ = true; - - // FIXME: We don't need this? 
- DispatchID = true; - } else if (!AMDGPU::isGraphics(CC)) { + if (!AMDGPU::isGraphics(CC)) { if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x")) WorkGroupIDX = true; @@ -462,7 +444,7 @@ void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) { MFI.setStackID(i, TargetStackID::Default); for (auto &R : VGPRToAGPRSpills) { - if (R.second.FullyAllocated) + if (R.second.IsDead) MFI.RemoveStackObject(R.first); } } diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index c305bc20e40d..8accbf611c5f 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -465,6 +465,7 @@ public: struct VGPRSpillToAGPR { SmallVector<MCPhysReg, 32> Lanes; bool FullyAllocated = false; + bool IsDead = false; }; // Map WWM VGPR to a stack slot that is used to save/restore it in the @@ -546,6 +547,12 @@ public: : I->second.Lanes[Lane]; } + void setVGPRToAGPRSpillDead(int FrameIndex) { + auto I = VGPRToAGPRSpills.find(FrameIndex); + if (I != VGPRToAGPRSpills.end()) + I->second.IsDead = true; + } + bool haveFreeLanesForSGPRSpill(const MachineFunction &MF, unsigned NumLane) const; bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp index 5590d84cc3ab..81db66a98ddf 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -869,29 +869,27 @@ void SIScheduleBlockCreator::colorComputeReservedDependencies() { } void SIScheduleBlockCreator::colorAccordingToReservedDependencies() { - unsigned DAGSize = DAG->SUnits.size(); std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations; // Every combination of colors given by the top down // and bottom up Reserved node dependency - for (unsigned i = 0, e = DAGSize; i != e; ++i) { - SUnit *SU = &DAG->SUnits[i]; + for (const SUnit &SU : DAG->SUnits) { std::pair<unsigned, unsigned> SUColors; // High latency instructions: already given. - if (CurrentColoring[SU->NodeNum]) + if (CurrentColoring[SU.NodeNum]) continue; - SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum]; - SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum]; + SUColors.first = CurrentTopDownReservedDependencyColoring[SU.NodeNum]; + SUColors.second = CurrentBottomUpReservedDependencyColoring[SU.NodeNum]; std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos = ColorCombinations.find(SUColors); if (Pos != ColorCombinations.end()) { - CurrentColoring[SU->NodeNum] = Pos->second; + CurrentColoring[SU.NodeNum] = Pos->second; } else { - CurrentColoring[SU->NodeNum] = NextNonReservedID; + CurrentColoring[SU.NodeNum] = NextNonReservedID; ColorCombinations[SUColors] = NextNonReservedID++; } } @@ -1232,15 +1230,13 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria } // Free root and leafs of all blocks to enable scheduling inside them. 
- for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { - SIScheduleBlock *Block = CurrentBlocks[i]; + for (SIScheduleBlock *Block : CurrentBlocks) Block->finalizeUnits(); - } - LLVM_DEBUG(dbgs() << "Blocks created:\n\n"; - for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { - SIScheduleBlock *Block = CurrentBlocks[i]; - Block->printDebug(true); - }); + LLVM_DEBUG({ + dbgs() << "Blocks created:\n\n"; + for (SIScheduleBlock *Block : CurrentBlocks) + Block->printDebug(true); + }); } // Two functions taken from Codegen/MachineScheduler.cpp @@ -1379,9 +1375,9 @@ void SIScheduleBlockCreator::scheduleInsideBlocks() { } } - LLVM_DEBUG(for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { - SIScheduleBlock *Block = CurrentBlocks[i]; - Block->printDebug(true); + LLVM_DEBUG({ + for (SIScheduleBlock *Block : CurrentBlocks) + Block->printDebug(true); }); } @@ -1437,8 +1433,7 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, // found for several parents, we increment the usage of the one with the // highest topological index. LiveOutRegsNumUsages.resize(Blocks.size()); - for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { - SIScheduleBlock *Block = Blocks[i]; + for (SIScheduleBlock *Block : Blocks) { for (unsigned Reg : Block->getInRegs()) { bool Found = false; int topoInd = -1; @@ -1502,8 +1497,7 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, // Fill LiveRegsConsumers for regs that were already // defined before scheduling. - for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { - SIScheduleBlock *Block = Blocks[i]; + for (SIScheduleBlock *Block : Blocks) { for (unsigned Reg : Block->getInRegs()) { bool Found = false; for (SIScheduleBlock* Pred: Block->getPreds()) { @@ -1700,10 +1694,7 @@ void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) { decreaseLiveRegs(Block, Block->getInRegs()); addLiveRegs(Block->getOutRegs()); releaseBlockSuccs(Block); - for (std::map<unsigned, unsigned>::iterator RegI = - LiveOutRegsNumUsages[Block->getID()].begin(), - E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) { - std::pair<unsigned, unsigned> RegP = *RegI; + for (const auto &RegP : LiveOutRegsNumUsages[Block->getID()]) { // We produce this register, thus it must not be previously alive. 
assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() || LiveRegsConsumers[RegP.first] == 0); @@ -1759,8 +1750,7 @@ SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, ScheduledBlocks = Scheduler.getBlocks(); - for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) { - SIScheduleBlock *Block = ScheduledBlocks[b]; + for (SIScheduleBlock *Block : ScheduledBlocks) { std::vector<SUnit*> SUs = Block->getScheduledUnits(); for (SUnit* SU : SUs) @@ -2000,9 +1990,8 @@ void SIScheduleDAGMI::schedule() assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker"); TopRPTracker.setPos(CurrentTop); - for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(), - E = ScheduledSUnits.end(); I != E; ++I) { - SUnit *SU = &SUnits[*I]; + for (unsigned I : ScheduledSUnits) { + SUnit *SU = &SUnits[I]; scheduleMI(SU, true); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index a1d9a23a5084..21aed4ececb5 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -210,6 +210,7 @@ struct SGPRSpillBuilder { auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); if (!TmpVGPRLive) I.addReg(TmpVGPR, RegState::ImplicitDefine); + I->getOperand(2).setIsDead(true); // Mark SCC as dead. TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false); } } @@ -242,9 +243,10 @@ struct SGPRSpillBuilder { TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true, /*IsKill*/ false); auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); - if (!TmpVGPRLive) { + if (!TmpVGPRLive) I.addReg(TmpVGPR, RegState::ImplicitKill); - } + I->getOperand(2).setIsDead(true); // Mark SCC as dead. + // Restore active lanes if (TmpVGPRLive) TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true); @@ -267,9 +269,11 @@ struct SGPRSpillBuilder { TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad, /*IsKill*/ false); // Spill inactive lanes - BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); + auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); + Not0->getOperand(2).setIsDead(); // Mark SCC as dead. TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad); - BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); + auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg); + Not1->getOperand(2).setIsDead(); // Mark SCC as dead. 
} } @@ -908,6 +912,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V1024_RESTORE: case AMDGPU::SI_SPILL_A1024_SAVE: case AMDGPU::SI_SPILL_A1024_RESTORE: + case AMDGPU::SI_SPILL_AV1024_SAVE: + case AMDGPU::SI_SPILL_AV1024_RESTORE: return 32; case AMDGPU::SI_SPILL_S512_SAVE: case AMDGPU::SI_SPILL_S512_RESTORE: @@ -915,6 +921,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V512_RESTORE: case AMDGPU::SI_SPILL_A512_SAVE: case AMDGPU::SI_SPILL_A512_RESTORE: + case AMDGPU::SI_SPILL_AV512_SAVE: + case AMDGPU::SI_SPILL_AV512_RESTORE: return 16; case AMDGPU::SI_SPILL_S256_SAVE: case AMDGPU::SI_SPILL_S256_RESTORE: @@ -922,6 +930,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V256_RESTORE: case AMDGPU::SI_SPILL_A256_SAVE: case AMDGPU::SI_SPILL_A256_RESTORE: + case AMDGPU::SI_SPILL_AV256_SAVE: + case AMDGPU::SI_SPILL_AV256_RESTORE: return 8; case AMDGPU::SI_SPILL_S224_SAVE: case AMDGPU::SI_SPILL_S224_RESTORE: @@ -929,6 +939,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V224_RESTORE: case AMDGPU::SI_SPILL_A224_SAVE: case AMDGPU::SI_SPILL_A224_RESTORE: + case AMDGPU::SI_SPILL_AV224_SAVE: + case AMDGPU::SI_SPILL_AV224_RESTORE: return 7; case AMDGPU::SI_SPILL_S192_SAVE: case AMDGPU::SI_SPILL_S192_RESTORE: @@ -936,6 +948,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V192_RESTORE: case AMDGPU::SI_SPILL_A192_SAVE: case AMDGPU::SI_SPILL_A192_RESTORE: + case AMDGPU::SI_SPILL_AV192_SAVE: + case AMDGPU::SI_SPILL_AV192_RESTORE: return 6; case AMDGPU::SI_SPILL_S160_SAVE: case AMDGPU::SI_SPILL_S160_RESTORE: @@ -943,6 +957,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V160_RESTORE: case AMDGPU::SI_SPILL_A160_SAVE: case AMDGPU::SI_SPILL_A160_RESTORE: + case AMDGPU::SI_SPILL_AV160_SAVE: + case AMDGPU::SI_SPILL_AV160_RESTORE: return 5; case AMDGPU::SI_SPILL_S128_SAVE: case AMDGPU::SI_SPILL_S128_RESTORE: @@ -950,6 +966,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V128_RESTORE: case AMDGPU::SI_SPILL_A128_SAVE: case AMDGPU::SI_SPILL_A128_RESTORE: + case AMDGPU::SI_SPILL_AV128_SAVE: + case AMDGPU::SI_SPILL_AV128_RESTORE: return 4; case AMDGPU::SI_SPILL_S96_SAVE: case AMDGPU::SI_SPILL_S96_RESTORE: @@ -957,6 +975,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V96_RESTORE: case AMDGPU::SI_SPILL_A96_SAVE: case AMDGPU::SI_SPILL_A96_RESTORE: + case AMDGPU::SI_SPILL_AV96_SAVE: + case AMDGPU::SI_SPILL_AV96_RESTORE: return 3; case AMDGPU::SI_SPILL_S64_SAVE: case AMDGPU::SI_SPILL_S64_RESTORE: @@ -964,6 +984,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V64_RESTORE: case AMDGPU::SI_SPILL_A64_SAVE: case AMDGPU::SI_SPILL_A64_RESTORE: + case AMDGPU::SI_SPILL_AV64_SAVE: + case AMDGPU::SI_SPILL_AV64_RESTORE: return 2; case AMDGPU::SI_SPILL_S32_SAVE: case AMDGPU::SI_SPILL_S32_RESTORE: @@ -971,6 +993,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { case AMDGPU::SI_SPILL_V32_RESTORE: case AMDGPU::SI_SPILL_A32_SAVE: case AMDGPU::SI_SPILL_A32_RESTORE: + case AMDGPU::SI_SPILL_AV32_SAVE: + case AMDGPU::SI_SPILL_AV32_RESTORE: return 1; default: llvm_unreachable("Invalid spill opcode"); } @@ -1240,9 +1264,10 @@ void SIRegisterInfo::buildSpillLoadStore( if (ScratchOffsetReg == AMDGPU::NoRegister) { BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset); } else { - BuildMI(MBB, MI, DL, 
TII->get(AMDGPU::S_ADD_I32), SOffset) + auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset) .addReg(ScratchOffsetReg) .addImm(Offset); + Add->getOperand(3).setIsDead(); // Mark SCC as dead. } Offset = 0; @@ -1810,7 +1835,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_A128_SAVE: case AMDGPU::SI_SPILL_A96_SAVE: case AMDGPU::SI_SPILL_A64_SAVE: - case AMDGPU::SI_SPILL_A32_SAVE: { + case AMDGPU::SI_SPILL_A32_SAVE: + case AMDGPU::SI_SPILL_AV1024_SAVE: + case AMDGPU::SI_SPILL_AV512_SAVE: + case AMDGPU::SI_SPILL_AV256_SAVE: + case AMDGPU::SI_SPILL_AV224_SAVE: + case AMDGPU::SI_SPILL_AV192_SAVE: + case AMDGPU::SI_SPILL_AV160_SAVE: + case AMDGPU::SI_SPILL_AV128_SAVE: + case AMDGPU::SI_SPILL_AV96_SAVE: + case AMDGPU::SI_SPILL_AV64_SAVE: + case AMDGPU::SI_SPILL_AV32_SAVE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == @@ -1846,7 +1881,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, case AMDGPU::SI_SPILL_A224_RESTORE: case AMDGPU::SI_SPILL_A256_RESTORE: case AMDGPU::SI_SPILL_A512_RESTORE: - case AMDGPU::SI_SPILL_A1024_RESTORE: { + case AMDGPU::SI_SPILL_A1024_RESTORE: + case AMDGPU::SI_SPILL_AV32_RESTORE: + case AMDGPU::SI_SPILL_AV64_RESTORE: + case AMDGPU::SI_SPILL_AV96_RESTORE: + case AMDGPU::SI_SPILL_AV128_RESTORE: + case AMDGPU::SI_SPILL_AV160_RESTORE: + case AMDGPU::SI_SPILL_AV192_RESTORE: + case AMDGPU::SI_SPILL_AV224_RESTORE: + case AMDGPU::SI_SPILL_AV256_RESTORE: + case AMDGPU::SI_SPILL_AV512_RESTORE: + case AMDGPU::SI_SPILL_AV1024_RESTORE: { const MachineOperand *VData = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata); assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 3a372d4519fb..c8f1daf26de9 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -731,11 +731,6 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { continue; } - // getVOPe32 could be -1 here if we started with an instruction that had - // a 32-bit encoding and then commuted it to an instruction that did not. - if (!TII->hasVALU32BitEncoding(MI.getOpcode())) - continue; - int Op32 = AMDGPU::getVOPe32(MI.getOpcode()); if (TII->isVOPC(Op32)) { @@ -776,10 +771,6 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); - // Check the carry-in operand for v_addc_u32_e64. - const MachineOperand *Src2 = TII->getNamedOperand(MI, - AMDGPU::OpName::src2); - if (SDst) { bool Next = false; @@ -791,6 +782,8 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { // All of the instructions with carry outs also have an SGPR input in // src2. 
+ const MachineOperand *Src2 = TII->getNamedOperand(MI, + AMDGPU::OpName::src2); if (Src2 && Src2->getReg() != VCCReg) { if (Src2->getReg().isVirtual()) MRI.setRegAllocationHint(Src2->getReg(), 0, VCCReg); diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 46012e5d7d97..77ee3c0ff0e4 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -495,11 +495,10 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, // instruction as needing e.g. WQM before visiting it and realizing it needs // WQM disabled. ReversePostOrderTraversal<MachineFunction *> RPOT(&MF); - for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) { - MachineBasicBlock &MBB = **BI; - BlockInfo &BBI = Blocks[&MBB]; + for (MachineBasicBlock *MBB : RPOT) { + BlockInfo &BBI = Blocks[MBB]; - for (MachineInstr &MI : MBB) { + for (MachineInstr &MI : *MBB) { InstrInfo &III = Instructions[&MI]; unsigned Opcode = MI.getOpcode(); char Flags = 0; @@ -561,7 +560,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, BBI.Needs |= StateExact; if (!(BBI.InNeeds & StateExact)) { BBI.InNeeds |= StateExact; - Worklist.push_back(&MBB); + Worklist.push_back(MBB); } GlobalFlags |= StateExact; III.Disabled = StateWQM | StateStrict; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td index 8502ed61b366..184c871db775 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -181,15 +181,8 @@ class SM_Time_Pseudo<string opName, SDPatternOperator node = null_frag> : SM_Pse " $sdst", [(set i64:$sdst, (node))]> { let hasSideEffects = 1; - // FIXME: This should be definitively mayStore = 0. TableGen - // brokenly tries to infer these based on the intrinsic properties - // corresponding to the IR attributes. The target intrinsics are - // considered as writing to memory for IR dependency purposes, but - // those can be modeled with hasSideEffects here. These also end up - // inferring differently for llvm.readcyclecounter and the amdgcn - // intrinsics. 
- let mayStore = ?; - let mayLoad = 1; + let mayStore = 0; + let mayLoad = 0; let has_sbase = 0; let has_offset = 0; } @@ -765,11 +758,11 @@ def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformL }]; } -def SMRDImm : ComplexPattern<i64, 2, "SelectSMRDImm">; -def SMRDImm32 : ComplexPattern<i64, 2, "SelectSMRDImm32">; -def SMRDSgpr : ComplexPattern<i64, 2, "SelectSMRDSgpr">; -def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">; -def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">; +def SMRDImm : ComplexPattern<iPTR, 2, "SelectSMRDImm">; +def SMRDImm32 : ComplexPattern<iPTR, 2, "SelectSMRDImm32">; +def SMRDSgpr : ComplexPattern<iPTR, 2, "SelectSMRDSgpr">; +def SMRDBufferImm : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">; +def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">; multiclass SMRD_Pattern <string Instr, ValueType vt> { diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td index 61ecc13620a1..1713586dcf5b 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -157,6 +157,42 @@ class SOP1_1 <string opName, RegisterClass rc = SReg_64, list<dag> pattern=[]> : let has_sdst = 0; } +class UniformUnaryFrag<SDPatternOperator Op> : PatFrag < + (ops node:$src0), + (Op $src0), + [{ return !N->isDivergent(); }]> { + // This check is unnecessary as it's captured by the result register + // bank constraint. + // + // FIXME: Should add a way for the emitter to recognize this is a + // trivially true predicate to eliminate the check. + let GISelPredicateCode = [{return true;}]; +} + +class UniformBinFrag<SDPatternOperator Op> : PatFrag < + (ops node:$src0, node:$src1), + (Op $src0, $src1), + [{ return !N->isDivergent(); }]> { + // This check is unnecessary as it's captured by the result register + // bank constraint. + // + // FIXME: Should add a way for the emitter to recognize this is a + // trivially true predicate to eliminate the check. + let GISelPredicateCode = [{return true;}]; +} + +class DivergentBinFrag<SDPatternOperator Op> : PatFrag < + (ops node:$src0, node:$src1), + (Op $src0, $src1), + [{ return N->isDivergent(); }]> { + // This check is unnecessary as it's captured by the result register + // bank constraint. + // + // FIXME: Should add a way for the emitter to recognize this is a + // trivially true predicate to eliminate the check. 
+ let GISelPredicateCode = [{return true;}]; +} + let isMoveImm = 1 in { let isReMaterializable = 1, isAsCheapAsAMove = 1 in { @@ -172,11 +208,11 @@ let isMoveImm = 1 in { let Defs = [SCC] in { def S_NOT_B32 : SOP1_32 <"s_not_b32", - [(set i32:$sdst, (not i32:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<not> i32:$src0))] >; def S_NOT_B64 : SOP1_64 <"s_not_b64", - [(set i64:$sdst, (not i64:$src0))] + [(set i64:$sdst, (UniformUnaryFrag<not> i64:$src0))] >; def S_WQM_B32 : SOP1_32 <"s_wqm_b32">; def S_WQM_B64 : SOP1_64 <"s_wqm_b64">; @@ -221,22 +257,22 @@ let isReMaterializable = 1 in { def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">; def S_FF0_I32_B64 : SOP1_32_64 <"s_ff0_i32_b64">; def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64", - [(set i32:$sdst, (AMDGPUffbl_b32 i64:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbl_b32> i64:$src0))] >; def S_FF1_I32_B32 : SOP1_32 <"s_ff1_i32_b32", - [(set i32:$sdst, (AMDGPUffbl_b32 i32:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbl_b32> i32:$src0))] >; def S_FLBIT_I32_B32 : SOP1_32 <"s_flbit_i32_b32", - [(set i32:$sdst, (AMDGPUffbh_u32 i32:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbh_u32> i32:$src0))] >; def S_FLBIT_I32_B64 : SOP1_32_64 <"s_flbit_i32_b64", - [(set i32:$sdst, (AMDGPUffbh_u32 i64:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbh_u32> i64:$src0))] >; def S_FLBIT_I32 : SOP1_32 <"s_flbit_i32", - [(set i32:$sdst, (AMDGPUffbh_i32 i32:$src0))] + [(set i32:$sdst, (UniformUnaryFrag<AMDGPUffbh_i32> i32:$src0))] >; def S_FLBIT_I32_I64 : SOP1_32_64 <"s_flbit_i32_i64">; def S_SEXT_I32_I8 : SOP1_32 <"s_sext_i32_i8", @@ -426,41 +462,6 @@ class SOP2_64_32_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo < "$sdst, $src0, $src1", pattern >; -class UniformUnaryFrag<SDPatternOperator Op> : PatFrag < - (ops node:$src0), - (Op $src0), - [{ return !N->isDivergent(); }]> { - // This check is unnecessary as it's captured by the result register - // bank constraint. - // - // FIXME: Should add a way for the emitter to recognize this is a - // trivially true predicate to eliminate the check. - let GISelPredicateCode = [{return true;}]; -} - -class UniformBinFrag<SDPatternOperator Op> : PatFrag < - (ops node:$src0, node:$src1), - (Op $src0, $src1), - [{ return !N->isDivergent(); }]> { - // This check is unnecessary as it's captured by the result register - // bank constraint. - // - // FIXME: Should add a way for the emitter to recognize this is a - // trivially true predicate to eliminate the check. - let GISelPredicateCode = [{return true;}]; -} - -class DivergentBinFrag<SDPatternOperator Op> : PatFrag < - (ops node:$src0, node:$src1), - (Op $src0, $src1), - [{ return N->isDivergent(); }]> { - // This check is unnecessary as it's captured by the result register - // bank constraint. - // - // FIXME: Should add a way for the emitter to recognize this is a - // trivially true predicate to eliminate the check. 
- let GISelPredicateCode = [{return true;}]; -} let Defs = [SCC] in { // Carry out goes to SCC let isCommutable = 1 in { @@ -485,19 +486,18 @@ def S_SUBB_U32 : SOP2_32 <"s_subb_u32", [(set i32:$sdst, (UniformBinFrag<sube> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]>; } // End Uses = [SCC] - let isCommutable = 1 in { def S_MIN_I32 : SOP2_32 <"s_min_i32", - [(set i32:$sdst, (smin i32:$src0, i32:$src1))] + [(set i32:$sdst, (UniformBinFrag<smin> i32:$src0, i32:$src1))] >; def S_MIN_U32 : SOP2_32 <"s_min_u32", - [(set i32:$sdst, (umin i32:$src0, i32:$src1))] + [(set i32:$sdst, (UniformBinFrag<umin> i32:$src0, i32:$src1))] >; def S_MAX_I32 : SOP2_32 <"s_max_i32", - [(set i32:$sdst, (smax i32:$src0, i32:$src1))] + [(set i32:$sdst, (UniformBinFrag<smax> i32:$src0, i32:$src1))] >; def S_MAX_U32 : SOP2_32 <"s_max_u32", - [(set i32:$sdst, (umax i32:$src0, i32:$src1))] + [(set i32:$sdst, (UniformBinFrag<umax> i32:$src0, i32:$src1))] >; } // End isCommutable = 1 } // End Defs = [SCC] @@ -870,7 +870,7 @@ def S_GETREG_B32 : SOPK_Pseudo < } } // End mayLoad = 1 -let mayLoad = 0, mayStore = 0, Defs = [MODE], Uses = [MODE] in { +let Defs = [MODE], Uses = [MODE] in { // FIXME: Need to truncate immediate to 16-bits. class S_SETREG_B32_Pseudo <list<dag> pattern=[]> : SOPK_Pseudo < @@ -914,7 +914,7 @@ def S_SETREG_IMM32_B32_mode : S_SETREG_IMM32_B32_Pseudo { let hasSideEffects = 0; } -} // End mayLoad = 0, mayStore = 0, Defs = [MODE], Uses = [MODE] +} // End Defs = [MODE], Uses = [MODE] class SOPK_WAITCNT<string opName, list<dag> pat=[]> : SOPK_Pseudo< @@ -1264,7 +1264,7 @@ def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > { let mayStore = 1; } -let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in +let hasSideEffects = 1 in def S_WAITCNT : SOPP_Pseudo <"s_waitcnt" , (ins WAIT_FLAG:$simm16), "$simm16", [(int_amdgcn_s_waitcnt timm:$simm16)]>; def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16", @@ -1278,8 +1278,6 @@ def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">; def S_SLEEP : SOPP_Pseudo <"s_sleep", (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_sleep timm:$simm16)]> { let hasSideEffects = 1; - let mayLoad = 0; - let mayStore = 0; } def S_SETPRIO : SOPP_Pseudo <"s_setprio" , (ins i16imm:$simm16), "$simm16">; @@ -1305,14 +1303,10 @@ def S_ICACHE_INV : SOPP_Pseudo <"s_icache_inv", (ins)> { def S_INCPERFLEVEL : SOPP_Pseudo <"s_incperflevel", (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_incperflevel timm:$simm16)]> { let hasSideEffects = 1; - let mayLoad = 0; - let mayStore = 0; } def S_DECPERFLEVEL : SOPP_Pseudo <"s_decperflevel", (ins i32imm:$simm16), "$simm16", [(int_amdgcn_s_decperflevel timm:$simm16)]> { let hasSideEffects = 1; - let mayLoad = 0; - let mayStore = 0; } def S_TTRACEDATA : SOPP_Pseudo <"s_ttracedata", (ins)> { let simm16 = 0; diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp index 2e4d83fbbc39..a83ff6667956 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.cpp @@ -15,7 +15,6 @@ #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SetVector.h" -#include "llvm/Analysis/CallGraph.h" #include "llvm/IR/Constants.h" #include "llvm/IR/ReplaceConstant.h" @@ -25,175 +24,6 @@ namespace llvm { namespace AMDGPU { -// An helper class for collecting all reachable callees for each kernel defined -// within the 
module. -class CollectReachableCallees { - Module &M; - CallGraph CG; - SmallPtrSet<CallGraphNode *, 8> AddressTakenFunctions; - - // Collect all address taken functions within the module. - void collectAddressTakenFunctions() { - auto *ECNode = CG.getExternalCallingNode(); - - for (auto GI = ECNode->begin(), GE = ECNode->end(); GI != GE; ++GI) { - auto *CGN = GI->second; - auto *F = CGN->getFunction(); - if (!F || F->isDeclaration() || AMDGPU::isKernelCC(F)) - continue; - AddressTakenFunctions.insert(CGN); - } - } - - // For given kernel, collect all its reachable non-kernel functions. - SmallPtrSet<Function *, 8> collectReachableCallees(Function *K) { - SmallPtrSet<Function *, 8> ReachableCallees; - - // Call graph node which represents this kernel. - auto *KCGN = CG[K]; - - // Go through all call graph nodes reachable from the node representing this - // kernel, visit all their call sites, if the call site is direct, add - // corresponding callee to reachable callee set, if it is indirect, resolve - // the indirect call site to potential reachable callees, add them to - // reachable callee set, and repeat the process for the newly added - // potential callee nodes. - // - // FIXME: Need to handle bit-casted function pointers. - // - SmallVector<CallGraphNode *, 8> CGNStack(df_begin(KCGN), df_end(KCGN)); - SmallPtrSet<CallGraphNode *, 8> VisitedCGNodes; - while (!CGNStack.empty()) { - auto *CGN = CGNStack.pop_back_val(); - - if (!VisitedCGNodes.insert(CGN).second) - continue; - - // Ignore call graph node which does not have associated function or - // associated function is not a definition. - if (!CGN->getFunction() || CGN->getFunction()->isDeclaration()) - continue; - - for (auto GI = CGN->begin(), GE = CGN->end(); GI != GE; ++GI) { - auto *RCB = cast<CallBase>(GI->first.getValue()); - auto *RCGN = GI->second; - - if (auto *DCallee = RCGN->getFunction()) { - ReachableCallees.insert(DCallee); - } else if (RCB->isIndirectCall()) { - auto *RCBFTy = RCB->getFunctionType(); - for (auto *ACGN : AddressTakenFunctions) { - auto *ACallee = ACGN->getFunction(); - if (ACallee->getFunctionType() == RCBFTy) { - ReachableCallees.insert(ACallee); - CGNStack.append(df_begin(ACGN), df_end(ACGN)); - } - } - } - } - } - - return ReachableCallees; - } - -public: - explicit CollectReachableCallees(Module &M) : M(M), CG(CallGraph(M)) { - // Collect address taken functions. - collectAddressTakenFunctions(); - } - - void collectReachableCallees( - DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) { - // Collect reachable callee set for each kernel defined in the module. - for (Function &F : M.functions()) { - if (!AMDGPU::isKernelCC(&F)) - continue; - Function *K = &F; - KernelToCallees[K] = collectReachableCallees(K); - } - } -}; - -void collectReachableCallees( - Module &M, - DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) { - CollectReachableCallees CRC{M}; - CRC.collectReachableCallees(KernelToCallees); -} - -SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV) { - SmallPtrSet<Function *, 8> LDSAccessors; - SmallVector<User *, 8> UserStack(GV->users()); - SmallPtrSet<User *, 8> VisitedUsers; - - while (!UserStack.empty()) { - auto *U = UserStack.pop_back_val(); - - // `U` is already visited? continue to next one. - if (!VisitedUsers.insert(U).second) - continue; - - // `U` is a global variable which is initialized with LDS. Ignore LDS. - if (isa<GlobalValue>(U)) - return SmallPtrSet<Function *, 8>(); - - // Recursively explore constant users. 
- if (isa<Constant>(U)) { - append_range(UserStack, U->users()); - continue; - } - - // `U` should be an instruction, if it belongs to a non-kernel function F, - // then collect F. - Function *F = cast<Instruction>(U)->getFunction(); - if (!AMDGPU::isKernelCC(F)) - LDSAccessors.insert(F); - } - - return LDSAccessors; -} - -DenseMap<Function *, SmallPtrSet<Instruction *, 8>> -getFunctionToInstsMap(User *U, bool CollectKernelInsts) { - DenseMap<Function *, SmallPtrSet<Instruction *, 8>> FunctionToInsts; - SmallVector<User *, 8> UserStack; - SmallPtrSet<User *, 8> VisitedUsers; - - UserStack.push_back(U); - - while (!UserStack.empty()) { - auto *UU = UserStack.pop_back_val(); - - if (!VisitedUsers.insert(UU).second) - continue; - - if (isa<GlobalValue>(UU)) - continue; - - if (isa<Constant>(UU)) { - append_range(UserStack, UU->users()); - continue; - } - - auto *I = cast<Instruction>(UU); - Function *F = I->getFunction(); - if (CollectKernelInsts) { - if (!AMDGPU::isKernelCC(F)) { - continue; - } - } else { - if (AMDGPU::isKernelCC(F)) { - continue; - } - } - - FunctionToInsts.insert(std::make_pair(F, SmallPtrSet<Instruction *, 8>())); - FunctionToInsts[F].insert(I); - } - - return FunctionToInsts; -} - bool isKernelCC(const Function *Func) { return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv()); } @@ -232,26 +62,8 @@ void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F) { } } -bool hasUserInstruction(const GlobalValue *GV) { - SmallPtrSet<const User *, 8> Visited; - SmallVector<const User *, 16> Stack(GV->users()); - - while (!Stack.empty()) { - const User *U = Stack.pop_back_val(); - - if (!Visited.insert(U).second) - continue; - - if (isa<Instruction>(U)) - return true; - - append_range(Stack, U->users()); - } - - return false; -} - -bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) { +static bool shouldLowerLDSToStruct(const GlobalVariable &GV, + const Function *F) { // We are not interested in kernel LDS lowering for module LDS itself. if (F && GV.getName() == "llvm.amdgcn.module.lds") return false; @@ -259,7 +71,6 @@ bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) { bool Ret = false; SmallPtrSet<const User *, 8> Visited; SmallVector<const User *, 16> Stack(GV.users()); - SmallPtrSet<const GlobalValue *, 8> GlobalUsers; assert(!F || isKernelCC(F)); @@ -267,15 +78,10 @@ bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) { const User *V = Stack.pop_back_val(); Visited.insert(V); - if (auto *G = dyn_cast<GlobalValue>(V)) { - StringRef GName = G->getName(); - if (F && GName != "llvm.used" && GName != "llvm.compiler.used") { - // For kernel LDS lowering, if G is not a compiler.used list, then we - // cannot lower the lds GV since we cannot replace the use of GV within - // G. - return false; - } - GlobalUsers.insert(G); + if (isa<GlobalValue>(V)) { + // This use of the LDS variable is the initializer of a global variable. + // This is ill formed. The address of an LDS variable is kernel dependent + // and unknown until runtime. It can't be written to a global variable. continue; } @@ -297,15 +103,6 @@ bool shouldLowerLDSToStruct(const GlobalVariable &GV, const Function *F) { append_range(Stack, V->users()); } - if (!F && !Ret) { - // For module LDS lowering, we have not yet decided if we should lower GV or - // not. 
Explore all global users of GV, and check if atleast one of these - // global users appear as an use within an instruction (possibly nested use - // via constant expression), if so, then conservately lower LDS. - for (auto *G : GlobalUsers) - Ret |= hasUserInstruction(G); - } - return Ret; } @@ -324,7 +121,7 @@ std::vector<GlobalVariable *> findVariablesToLower(Module &M, continue; } if (!isa<UndefValue>(GV.getInitializer())) { - // Initializers are unimplemented for local address space. + // Initializers are unimplemented for LDS address space. // Leave such variables in place for consistent error reporting. continue; } @@ -342,20 +139,6 @@ std::vector<GlobalVariable *> findVariablesToLower(Module &M, return LocalVars; } -SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) { - SmallPtrSet<GlobalValue *, 32> UsedList; - - SmallVector<GlobalValue *, 32> TmpVec; - collectUsedGlobalVariables(M, TmpVec, true); - UsedList.insert(TmpVec.begin(), TmpVec.end()); - - TmpVec.clear(); - collectUsedGlobalVariables(M, TmpVec, false); - UsedList.insert(TmpVec.begin(), TmpVec.end()); - - return UsedList; -} - } // end namespace AMDGPU } // end namespace llvm diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h index d1c9229bc336..83ef68cc3f60 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPULDSUtils.h @@ -22,44 +22,13 @@ class ConstantExpr; namespace AMDGPU { -/// Collect reachable callees for each kernel defined in the module \p M and -/// return collected callees at \p KernelToCallees. -void collectReachableCallees( - Module &M, - DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees); - -/// For the given LDS global \p GV, visit all its users and collect all -/// non-kernel functions within which \p GV is used and return collected list of -/// such non-kernel functions. -SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV); - -/// Collect all the instructions where user \p U belongs to. \p U could be -/// instruction itself or it could be a constant expression which is used within -/// an instruction. If \p CollectKernelInsts is true, collect instructions only -/// from kernels, otherwise collect instructions only from non-kernel functions. -DenseMap<Function *, SmallPtrSet<Instruction *, 8>> -getFunctionToInstsMap(User *U, bool CollectKernelInsts); - bool isKernelCC(const Function *Func); Align getAlign(DataLayout const &DL, const GlobalVariable *GV); -/// \returns true if a given global variable \p GV (or its global users) appear -/// as an use within some instruction (either from kernel or from non-kernel). -bool hasUserInstruction(const GlobalValue *GV); - -/// \returns true if an LDS global requires lowering to a module LDS structure -/// if \p F is not given. If \p F is given it must be a kernel and function -/// \returns true if an LDS global is directly used from that kernel and it -/// is safe to replace its uses with a kernel LDS structure member. -bool shouldLowerLDSToStruct(const GlobalVariable &GV, - const Function *F = nullptr); - std::vector<GlobalVariable *> findVariablesToLower(Module &M, const Function *F = nullptr); -SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M); - /// Replace all uses of constant \p C with instructions in \p F. 
void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F); } // end namespace AMDGPU diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td index a3eccf13cd71..a8368892c565 100644 --- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -794,6 +794,18 @@ class VOPPatGen<SDPatternOperator Op, VOPProfile P> { list<dag> ret = [!con(Outs, (set Ins))]; } +class DivergentUnaryFrag<SDPatternOperator Op> : PatFrag < + (ops node:$src0), + (Op $src0), + [{ return N->isDivergent(); }]> { + // This check is unnecessary as it's captured by the result register + // bank constraint. + // + // FIXME: Should add a way for the emitter to recognize this is a + // trivially true predicate to eliminate the check. + let GISelPredicateCode = [{return true;}]; +} + class VOPPatOrNull<SDPatternOperator Op, VOPProfile P> { list<dag> ret = !if(!ne(P.NeedPatGen,PatGenMode.NoPattern), VOPPatGen<Op, P>.ret, []); } |
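Editor's note (not part of the imported diff above): many of the pattern changes in this import wrap existing selection patterns in UniformUnaryFrag/UniformBinFrag/DivergentBinFrag/DivergentUnaryFrag so that uniform SDNodes keep selecting scalar (S_*) instructions while divergent SDNodes select the vector (V_*) forms, matching the SGPR/VGPR register-bank split. The TableGen predicate in each fragment reduces to a single divergence query on the node; a minimal C++ sketch of that check, assuming only SelectionDAG's SDNode::isDivergent() and using a hypothetical helper name, is:

  #include "llvm/CodeGen/SelectionDAGNodes.h"

  // Sketch only (hypothetical helper, not from the tree): mirrors the
  // [{ return N->isDivergent(); }] predicate used by the Divergent*Frag classes
  // above. Uniform nodes (false here) stay on the scalar unit; divergent nodes
  // must be selected to VALU instructions.
  static bool wantsVALUForm(const llvm::SDNode *N) {
    return N->isDivergent();
  }

The GISelPredicateCode in those fragments is deliberately left as "return true;" because, as the in-tree comments note, GlobalISel already captures the same constraint through register-bank selection.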